Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/8967432
* create ocr_detection task
* replace c++ nms with python version
* replace c++ decoder with python version
* add requirements for ocr_detection
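For context, a minimal usage sketch of the new task (the model id, test image and output key are the ones registered and used further down in this diff):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# build the ocr-detection pipeline with the default line-level detection model
ocr_detection = pipeline(
    Tasks.ocr_detection,
    model='damo/cv_resnet18_ocr-detection-line-level_damo')
result = ocr_detection('data/test/images/ocr_detection.jpg')
# det_polygons: np.array of shape [num_text, 8], each row (x1, y1, ..., x4, y4)
print(result['det_polygons'])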
| @@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
size 141149
| @@ -28,6 +28,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_generation:
('person-image-cartoon',
'damo/cv_unet_person-image-cartoon_compound-models'),
Tasks.ocr_detection: ('ocr-detection',
'damo/cv_resnet18_ocr-detection-line-level_damo'),
}
| @@ -1,2 +1,3 @@
from .image_cartoon_pipeline import ImageCartoonPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .ocr_detection_pipeline import OCRDetectionPipeline
| @@ -0,0 +1,167 @@ | |||||
| import math | |||||
| import os | |||||
| import os.path as osp | |||||
| import sys | |||||
| from typing import Any, Dict, List, Tuple, Union | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import PIL | |||||
| import tensorflow as tf | |||||
| import tf_slim as slim | |||||
| from modelscope.pipelines.base import Input | |||||
| from modelscope.preprocessors import load_image | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| from ..base import Pipeline | |||||
| from ..builder import PIPELINES | |||||
| from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils | |||||
| if tf.__version__ >= '2.0': | |||||
| tf = tf.compat.v1 | |||||
| tf.compat.v1.disable_eager_execution() | |||||
| logger = get_logger() | |||||
| # constant | |||||
| RBOX_DIM = 5 | |||||
| OFFSET_DIM = 6 | |||||
| WORD_POLYGON_DIM = 8 | |||||
| OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] | |||||
| FLAGS = tf.app.flags.FLAGS | |||||
| tf.app.flags.DEFINE_float('node_threshold', 0.4, | |||||
| 'Confidence threshold for nodes') | |||||
| tf.app.flags.DEFINE_float('link_threshold', 0.6, | |||||
| 'Confidence threshold for links') | |||||
| @PIPELINES.register_module( | |||||
| Tasks.ocr_detection, module_name=Tasks.ocr_detection) | |||||
| class OCRDetectionPipeline(Pipeline): | |||||
| def __init__(self, model: str): | |||||
| super().__init__(model=model) | |||||
| model_path = osp.join( | |||||
| osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), | |||||
| 'checkpoint-80000') | |||||
| config = tf.ConfigProto(allow_soft_placement=True) | |||||
| config.gpu_options.allow_growth = True | |||||
| self._session = tf.Session(config=config) | |||||
| global_step = tf.get_variable( | |||||
| 'global_step', [], | |||||
| initializer=tf.constant_initializer(0), | |||||
| dtype=tf.int64, | |||||
| trainable=False) | |||||
| variable_averages = tf.train.ExponentialMovingAverage( | |||||
| 0.997, global_step) | |||||
| self.input_images = tf.placeholder( | |||||
| tf.float32, shape=[1, 1024, 1024, 3], name='input_images') | |||||
| self.output = {} | |||||
| # detector | |||||
| detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector() | |||||
| all_maps = detector.build_model(self.input_images, is_training=False) | |||||
| # decode local predictions | |||||
| all_nodes, all_links, all_reg = [], [], [] | |||||
| for i, maps in enumerate(all_maps): | |||||
| cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] | |||||
| reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) | |||||
| cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) | |||||
| lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2]) | |||||
| lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:]) | |||||
| lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) | |||||
| all_nodes.append(cls_prob) | |||||
| all_links.append(lnk_prob) | |||||
| all_reg.append(reg_maps) | |||||
| # decode segments and links | |||||
| image_size = tf.shape(self.input_images)[1:3] | |||||
| segments, group_indices, segment_counts, _ = ops.decode_segments_links_python( | |||||
| image_size, | |||||
| all_nodes, | |||||
| all_links, | |||||
| all_reg, | |||||
| anchor_sizes=list(detector.anchor_sizes)) | |||||
| # combine segments | |||||
| combined_rboxes, combined_counts = ops.combine_segments_python( | |||||
| segments, group_indices, segment_counts) | |||||
| self.output['combined_rboxes'] = combined_rboxes | |||||
| self.output['combined_counts'] = combined_counts | |||||
| with self._session.as_default() as sess: | |||||
| logger.info(f'loading model from {model_path}') | |||||
| # load model | |||||
| model_loader = tf.train.Saver( | |||||
| variable_averages.variables_to_restore()) | |||||
| model_loader.restore(sess, model_path) | |||||
| def preprocess(self, input: Input) -> Dict[str, Any]: | |||||
| if isinstance(input, str): | |||||
| img = np.array(load_image(input)) | |||||
| elif isinstance(input, PIL.Image.Image): | |||||
| img = np.array(input.convert('RGB')) | |||||
| elif isinstance(input, np.ndarray): | |||||
if len(input.shape) == 2:
input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
img = input[:, :, ::-1]  # assume opencv-style BGR input, reverse to rgb order
| else: | |||||
raise TypeError(f'input should be either str, PIL.Image or'
f' np.ndarray, but got {type(input)}')
| h, w, c = img.shape | |||||
| img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32) | |||||
| img_pad[:h, :w, :] = img | |||||
| resize_size = 1024 | |||||
| img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size)) | |||||
| img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR) | |||||
| img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94], | |||||
| dtype=np.float32) | |||||
| resize_size = tf.stack([resize_size, resize_size]) | |||||
| orig_size = tf.stack([max(h, w), max(h, w)]) | |||||
| self.output['orig_size'] = orig_size | |||||
| self.output['resize_size'] = resize_size | |||||
| result = {'img': np.expand_dims(img_pad_resize, axis=0)} | |||||
| return result | |||||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||||
| with self._session.as_default(): | |||||
| feed_dict = {self.input_images: input['img']} | |||||
| sess_outputs = self._session.run(self.output, feed_dict=feed_dict) | |||||
| return sess_outputs | |||||
| def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: | |||||
| rboxes = inputs['combined_rboxes'][0] | |||||
| count = inputs['combined_counts'][0] | |||||
| rboxes = rboxes[:count, :] | |||||
# convert rboxes to polygons and map them back to coordinates on the original image
| orig_h, orig_w = inputs['orig_size'] | |||||
| resize_h, resize_w = inputs['resize_size'] | |||||
| polygons = utils.rboxes_to_polygons(rboxes) | |||||
| scale_y = float(orig_h) / float(resize_h) | |||||
| scale_x = float(orig_w) / float(resize_w) | |||||
| # confine polygons inside image | |||||
| polygons[:, ::2] = np.maximum( | |||||
| 0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1)) | |||||
| polygons[:, 1::2] = np.maximum( | |||||
| 0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1)) | |||||
| polygons = np.round(polygons).astype(np.int32) | |||||
| # nms | |||||
| dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()] | |||||
| dt_nms = utils.nms_python(dt_n9) | |||||
| dt_polygons = np.array([o[:8] for o in dt_nms]) | |||||
| result = {'det_polygons': dt_polygons} | |||||
| return result | |||||
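As a follow-up sketch (reviewer note, not part of this diff): the det_polygons output can be visualised with the draw_polygons helper added in ocr_utils below; the exact import path of ocr_utils is assumed here and may differ in the final package layout.

import cv2
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# draw_polygons lives in the new ocr_utils package; module path assumed
from modelscope.pipelines.cv.ocr_utils import utils as ocr_utils

ocr_detection = pipeline(Tasks.ocr_detection)  # default line-level model
result = ocr_detection('data/test/images/ocr_detection.jpg')
img = cv2.imread('data/test/images/ocr_detection.jpg')
vis = ocr_utils.draw_polygons(img, result['det_polygons'])  # green text-line boxes
cv2.imwrite('ocr_detection_vis.jpg', vis)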
| @@ -0,0 +1,158 @@ | |||||
| import tensorflow as tf | |||||
| import tf_slim as slim | |||||
| from . import ops, resnet18_v1, resnet_utils | |||||
| if tf.__version__ >= '2.0': | |||||
| tf = tf.compat.v1 | |||||
| # constants | |||||
| OFFSET_DIM = 6 | |||||
| N_LOCAL_LINKS = 8 | |||||
| N_CROSS_LINKS = 4 | |||||
| N_SEG_CLASSES = 2 | |||||
| N_LNK_CLASSES = 4 | |||||
| POS_LABEL = 1 | |||||
| NEG_LABEL = 0 | |||||
| class SegLinkDetector(): | |||||
| def __init__(self): | |||||
| self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.] | |||||
| def _detection_classifier(self, | |||||
| maps, | |||||
| ksize, | |||||
| weight_decay, | |||||
| cross_links=False, | |||||
| scope=None): | |||||
| with tf.variable_scope(scope): | |||||
| seg_depth = N_SEG_CLASSES | |||||
| if cross_links: | |||||
| lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS) | |||||
| else: | |||||
| lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS | |||||
| reg_depth = OFFSET_DIM | |||||
| map_depth = maps.get_shape()[3] | |||||
| inter_maps, inter_relu = ops.conv2d( | |||||
| maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter') | |||||
| dir_maps, dir_relu = ops.conv2d( | |||||
| inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir') | |||||
| cen_maps, cen_relu = ops.conv2d( | |||||
| inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen') | |||||
| pol_maps, pol_relu = ops.conv2d( | |||||
| inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol') | |||||
| concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1) | |||||
| _, lnk_embedding = ops.conv_relu( | |||||
| concat_relu, 12, 256, 1, 1, scope='lnk_embedding') | |||||
| lnk_maps, lnk_relu = ops.conv2d( | |||||
| inter_relu + lnk_embedding, | |||||
| 256, | |||||
| lnk_depth, | |||||
| ksize, | |||||
| 1, | |||||
| 'SAME', | |||||
| scope='conv_lnk') | |||||
| char_seg_maps, char_seg_relu = ops.conv2d( | |||||
| inter_relu, | |||||
| 256, | |||||
| seg_depth, | |||||
| ksize, | |||||
| 1, | |||||
| 'SAME', | |||||
| scope='conv_char_cls') | |||||
| char_reg_maps, char_reg_relu = ops.conv2d( | |||||
| inter_relu, | |||||
| 256, | |||||
| reg_depth, | |||||
| ksize, | |||||
| 1, | |||||
| 'SAME', | |||||
| scope='conv_char_reg') | |||||
| concat_char_relu = tf.concat([char_seg_relu, char_reg_relu], | |||||
| axis=-1) | |||||
| _, char_embedding = ops.conv_relu( | |||||
| concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding') | |||||
| seg_maps, seg_relu = ops.conv2d( | |||||
| inter_relu + char_embedding, | |||||
| 256, | |||||
| seg_depth, | |||||
| ksize, | |||||
| 1, | |||||
| 'SAME', | |||||
| scope='conv_cls') | |||||
| reg_maps, reg_relu = ops.conv2d( | |||||
| inter_relu + char_embedding, | |||||
| 256, | |||||
| reg_depth, | |||||
| ksize, | |||||
| 1, | |||||
| 'SAME', | |||||
| scope='conv_reg') | |||||
| return seg_relu, lnk_relu, reg_relu | |||||
| def _build_cnn(self, images, weight_decay, is_training): | |||||
| with slim.arg_scope( | |||||
| resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)): | |||||
| logits, end_points = resnet18_v1.resnet_v1_18( | |||||
| images, is_training=is_training, scope='resnet_v1_18') | |||||
| outputs = { | |||||
| 'conv3_3': end_points['pool1'], | |||||
| 'conv4_3': end_points['pool2'], | |||||
| 'fc7': end_points['pool3'], | |||||
| 'conv8_2': end_points['pool4'], | |||||
| 'conv9_2': end_points['pool5'], | |||||
| 'conv10_2': end_points['pool6'], | |||||
| } | |||||
| return outputs | |||||
| def build_model(self, images, is_training=True, scope=None): | |||||
| weight_decay = 5e-4 # FLAGS.weight_decay | |||||
| cnn_outputs = self._build_cnn(images, weight_decay, is_training) | |||||
| det_0 = self._detection_classifier( | |||||
| cnn_outputs['conv3_3'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=False, | |||||
| scope='dete_0') | |||||
| det_1 = self._detection_classifier( | |||||
| cnn_outputs['conv4_3'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=True, | |||||
| scope='dete_1') | |||||
| det_2 = self._detection_classifier( | |||||
| cnn_outputs['fc7'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=True, | |||||
| scope='dete_2') | |||||
| det_3 = self._detection_classifier( | |||||
| cnn_outputs['conv8_2'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=True, | |||||
| scope='dete_3') | |||||
| det_4 = self._detection_classifier( | |||||
| cnn_outputs['conv9_2'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=True, | |||||
| scope='dete_4') | |||||
| det_5 = self._detection_classifier( | |||||
| cnn_outputs['conv10_2'], | |||||
| 3, | |||||
| weight_decay, | |||||
| cross_links=True, | |||||
| scope='dete_5') | |||||
| outputs = [det_0, det_1, det_2, det_3, det_4, det_5] | |||||
| return outputs | |||||
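A minimal sketch of how this detector is wired into a graph (it mirrors what OCRDetectionPipeline.__init__ above does):

import tensorflow as tf
if tf.__version__ >= '2.0':
    tf = tf.compat.v1
    tf.disable_eager_execution()

images = tf.placeholder(tf.float32, shape=[1, 1024, 1024, 3])
detector = SegLinkDetector()
# build_model returns one (cls_maps, lnk_maps, reg_maps) triple per feature
# level: conv3_3, conv4_3, fc7, conv8_2, conv9_2, conv10_2
all_maps = detector.build_model(images, is_training=False)
assert len(all_maps) == len(detector.anchor_sizes)  # 6 detection scales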
| @@ -0,0 +1,432 @@ | |||||
| """Contains definitions for the original form of Residual Networks. | |||||
| The 'v1' residual networks (ResNets) implemented in this module were proposed | |||||
| by: | |||||
| [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||||
| Deep Residual Learning for Image Recognition. arXiv:1512.03385 | |||||
| Other variants were introduced in: | |||||
| [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||||
| Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 | |||||
| The networks defined in this module utilize the bottleneck building block of | |||||
| [1] with projection shortcuts only for increasing depths. They employ batch | |||||
| normalization *after* every weight layer. This is the architecture used by | |||||
| MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and | |||||
| ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' | |||||
| architecture and the alternative 'v2' architecture of [2] which uses batch | |||||
| normalization *before* every weight layer in the so-called full pre-activation | |||||
| units. | |||||
| Typical use: | |||||
| from tensorflow.contrib.slim.nets import resnet_v1 | |||||
| ResNet-101 for image classification into 1000 classes: | |||||
| # inputs has shape [batch, 224, 224, 3] | |||||
| with slim.arg_scope(resnet_v1.resnet_arg_scope()): | |||||
| net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) | |||||
| ResNet-101 for semantic segmentation into 21 classes: | |||||
| # inputs has shape [batch, 513, 513, 3] | |||||
| with slim.arg_scope(resnet_v1.resnet_arg_scope()): | |||||
| net, end_points = resnet_v1.resnet_v1_101(inputs, | |||||
| 21, | |||||
| is_training=False, | |||||
| global_pool=False, | |||||
| output_stride=16) | |||||
| """ | |||||
| import tensorflow as tf | |||||
| import tf_slim as slim | |||||
| from . import resnet_utils | |||||
| if tf.__version__ >= '2.0': | |||||
| tf = tf.compat.v1 | |||||
| resnet_arg_scope = resnet_utils.resnet_arg_scope | |||||
| @slim.add_arg_scope | |||||
| def basicblock(inputs, | |||||
| depth, | |||||
| depth_bottleneck, | |||||
| stride, | |||||
| rate=1, | |||||
| outputs_collections=None, | |||||
| scope=None): | |||||
| """Bottleneck residual unit variant with BN after convolutions. | |||||
| This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for | |||||
| its definition. Note that we use here the bottleneck variant which has an | |||||
| extra bottleneck layer. | |||||
| When putting together two consecutive ResNet blocks that use this unit, one | |||||
| should use stride = 2 in the last unit of the first block. | |||||
| Args: | |||||
| inputs: A tensor of size [batch, height, width, channels]. | |||||
| depth: The depth of the ResNet unit output. | |||||
depth_bottleneck: The depth of the bottleneck layers (unused in this basic block; kept for interface compatibility with `bottleneck`).
| stride: The ResNet unit's stride. Determines the amount of downsampling of | |||||
| the units output compared to its input. | |||||
| rate: An integer, rate for atrous convolution. | |||||
| outputs_collections: Collection to add the ResNet unit output. | |||||
| scope: Optional variable_scope. | |||||
| Returns: | |||||
| The ResNet unit's output. | |||||
| """ | |||||
| with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: | |||||
| depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) | |||||
| if depth == depth_in: | |||||
| shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') | |||||
| else: | |||||
| shortcut = slim.conv2d( | |||||
| inputs, | |||||
| depth, [1, 1], | |||||
| stride=stride, | |||||
| activation_fn=None, | |||||
| scope='shortcut') | |||||
| residual = resnet_utils.conv2d_same( | |||||
| inputs, depth, 3, stride, rate=rate, scope='conv1') | |||||
| residual = resnet_utils.conv2d_same( | |||||
| residual, depth, 3, 1, rate=rate, scope='conv2') | |||||
| output = tf.nn.relu(residual + shortcut) | |||||
| return slim.utils.collect_named_outputs(outputs_collections, | |||||
| sc.original_name_scope, output) | |||||
| @slim.add_arg_scope | |||||
| def bottleneck(inputs, | |||||
| depth, | |||||
| depth_bottleneck, | |||||
| stride, | |||||
| rate=1, | |||||
| outputs_collections=None, | |||||
| scope=None): | |||||
| """Bottleneck residual unit variant with BN after convolutions. | |||||
| This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for | |||||
| its definition. Note that we use here the bottleneck variant which has an | |||||
| extra bottleneck layer. | |||||
| When putting together two consecutive ResNet blocks that use this unit, one | |||||
| should use stride = 2 in the last unit of the first block. | |||||
| Args: | |||||
| inputs: A tensor of size [batch, height, width, channels]. | |||||
| depth: The depth of the ResNet unit output. | |||||
| depth_bottleneck: The depth of the bottleneck layers. | |||||
| stride: The ResNet unit's stride. Determines the amount of downsampling of | |||||
| the units output compared to its input. | |||||
| rate: An integer, rate for atrous convolution. | |||||
| outputs_collections: Collection to add the ResNet unit output. | |||||
| scope: Optional variable_scope. | |||||
| Returns: | |||||
| The ResNet unit's output. | |||||
| """ | |||||
| with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: | |||||
| depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) | |||||
| if depth == depth_in: | |||||
| shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') | |||||
| else: | |||||
| shortcut = slim.conv2d( | |||||
| inputs, | |||||
| depth, [1, 1], | |||||
| stride=stride, | |||||
| activation_fn=None, | |||||
| scope='shortcut') | |||||
| residual = slim.conv2d( | |||||
| inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1') | |||||
| residual = resnet_utils.conv2d_same( | |||||
| residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2') | |||||
| residual = slim.conv2d( | |||||
| residual, | |||||
| depth, [1, 1], | |||||
| stride=1, | |||||
| activation_fn=None, | |||||
| scope='conv3') | |||||
| output = tf.nn.relu(shortcut + residual) | |||||
| return slim.utils.collect_named_outputs(outputs_collections, | |||||
| sc.original_name_scope, output) | |||||
| def resnet_v1(inputs, | |||||
| blocks, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope=None): | |||||
| """Generator for v1 ResNet models. | |||||
| This function generates a family of ResNet v1 models. See the resnet_v1_*() | |||||
| methods for specific model instantiations, obtained by selecting different | |||||
| block instantiations that produce ResNets of various depths. | |||||
| Training for image classification on Imagenet is usually done with [224, 224] | |||||
| inputs, resulting in [7, 7] feature maps at the output of the last ResNet | |||||
| block for the ResNets defined in [1] that have nominal stride equal to 32. | |||||
| However, for dense prediction tasks we advise that one uses inputs with | |||||
| spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In | |||||
| this case the feature maps at the ResNet output will have spatial shape | |||||
| [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] | |||||
| and corners exactly aligned with the input image corners, which greatly | |||||
| facilitates alignment of the features to the image. Using as input [225, 225] | |||||
| images results in [8, 8] feature maps at the output of the last ResNet block. | |||||
| For dense prediction tasks, the ResNet needs to run in fully-convolutional | |||||
| (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all | |||||
| have nominal stride equal to 32 and a good choice in FCN mode is to use | |||||
| output_stride=16 in order to increase the density of the computed features at | |||||
| small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. | |||||
| Args: | |||||
| inputs: A tensor of size [batch, height_in, width_in, channels]. | |||||
| blocks: A list of length equal to the number of ResNet blocks. Each element | |||||
| is a resnet_utils.Block object describing the units in the block. | |||||
| num_classes: Number of predicted classes for classification tasks. If None | |||||
| we return the features before the logit layer. | |||||
| is_training: whether is training or not. | |||||
| global_pool: If True, we perform global average pooling before computing the | |||||
| logits. Set to True for image classification, False for dense prediction. | |||||
| output_stride: If None, then the output will be computed at the nominal | |||||
| network stride. If output_stride is not None, it specifies the requested | |||||
| ratio of input to output spatial resolution. | |||||
| include_root_block: If True, include the initial convolution followed by | |||||
| max-pooling, if False excludes it. | |||||
| spatial_squeeze: if True, logits is of shape [B, C], if false logits is | |||||
| of shape [B, 1, 1, C], where B is batch_size and C is number of classes. | |||||
| reuse: whether or not the network and its variables should be reused. To be | |||||
| able to reuse 'scope' must be given. | |||||
| scope: Optional variable_scope. | |||||
| Returns: | |||||
| net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. | |||||
| If global_pool is False, then height_out and width_out are reduced by a | |||||
| factor of output_stride compared to the respective height_in and width_in, | |||||
| else both height_out and width_out equal one. If num_classes is None, then | |||||
| net is the output of the last ResNet block, potentially after global | |||||
| average pooling. If num_classes is not None, net contains the pre-softmax | |||||
| activations. | |||||
| end_points: A dictionary from components of the network to the corresponding | |||||
| activation. | |||||
| Raises: | |||||
| ValueError: If the target output_stride is not valid. | |||||
| """ | |||||
| with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: | |||||
| end_points_collection = sc.name + '_end_points' | |||||
| with slim.arg_scope( | |||||
| [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense], | |||||
| outputs_collections=end_points_collection): | |||||
| with slim.arg_scope([slim.batch_norm], is_training=is_training): | |||||
| net = inputs | |||||
| if include_root_block: | |||||
| if output_stride is not None: | |||||
| if output_stride % 4 != 0: | |||||
| raise ValueError( | |||||
| 'The output_stride needs to be a multiple of 4.' | |||||
| ) | |||||
| output_stride /= 4 | |||||
| net = resnet_utils.conv2d_same( | |||||
| net, 64, 7, stride=2, scope='conv1') | |||||
| net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) | |||||
| net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') | |||||
| net = slim.utils.collect_named_outputs( | |||||
| end_points_collection, 'pool2', net) | |||||
| net = resnet_utils.stack_blocks_dense(net, blocks, | |||||
| output_stride) | |||||
| end_points = slim.utils.convert_collection_to_dict( | |||||
| end_points_collection) | |||||
| end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2'] | |||||
| end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2'] | |||||
| end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2'] | |||||
| end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2'] | |||||
| end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2'] | |||||
| end_points['pool6'] = net | |||||
| return net, end_points | |||||
| resnet_v1.default_image_size = 224 | |||||
| def resnet_v1_18(inputs, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope='resnet_v1_18'): | |||||
| """ResNet-18 model of [1]. See resnet_v1() for arg and return description.""" | |||||
| blocks = [ | |||||
| resnet_utils.Block('block1', basicblock, | |||||
| [(64, 64, 1)] + [(64, 64, 1)]), | |||||
| resnet_utils.Block('block2', basicblock, | |||||
| [(128, 128, 1)] + [(128, 128, 1)]), | |||||
| resnet_utils.Block('block3', basicblock, | |||||
| [(256, 256, 2)] + [(256, 256, 1)]), | |||||
| resnet_utils.Block('block4', basicblock, | |||||
| [(512, 512, 2)] + [(512, 512, 1)]), | |||||
| resnet_utils.Block('block5', basicblock, | |||||
| [(256, 256, 2)] + [(256, 256, 1)]), | |||||
| resnet_utils.Block('block6', basicblock, | |||||
| [(256, 256, 2)] + [(256, 256, 1)]), | |||||
| resnet_utils.Block('block7', basicblock, | |||||
| [(256, 256, 2)] + [(256, 256, 1)]), | |||||
| ] | |||||
| return resnet_v1( | |||||
| inputs, | |||||
| blocks, | |||||
| num_classes, | |||||
| is_training, | |||||
| global_pool=global_pool, | |||||
| output_stride=output_stride, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=spatial_squeeze, | |||||
| reuse=reuse, | |||||
| scope=scope) | |||||
| resnet_v1_18.default_image_size = resnet_v1.default_image_size | |||||
| def resnet_v1_50(inputs, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope='resnet_v1_50'): | |||||
| """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" | |||||
| blocks = [ | |||||
| resnet_utils.Block('block1', bottleneck, | |||||
| [(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||||
| resnet_utils.Block('block2', bottleneck, | |||||
| [(512, 128, 1)] * 3 + [(512, 128, 2)]), | |||||
| resnet_utils.Block('block3', bottleneck, | |||||
| [(1024, 256, 1)] * 5 + [(1024, 256, 2)]), | |||||
| resnet_utils.Block('block4', bottleneck, | |||||
| [(2048, 512, 1)] * 3 + [(2048, 512, 2)]), | |||||
| resnet_utils.Block('block5', bottleneck, | |||||
| [(1024, 256, 1)] * 2 + [(1024, 256, 2)]), | |||||
| resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2), | |||||
| ] | |||||
| return resnet_v1( | |||||
| inputs, | |||||
| blocks, | |||||
| num_classes, | |||||
| is_training, | |||||
| global_pool=global_pool, | |||||
| output_stride=output_stride, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=spatial_squeeze, | |||||
| reuse=reuse, | |||||
| scope=scope) | |||||
| resnet_v1_50.default_image_size = resnet_v1.default_image_size | |||||
| def resnet_v1_101(inputs, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope='resnet_v1_101'): | |||||
| """ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" | |||||
| blocks = [ | |||||
| resnet_utils.Block('block1', bottleneck, | |||||
| [(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||||
| resnet_utils.Block('block2', bottleneck, | |||||
| [(512, 128, 1)] * 3 + [(512, 128, 2)]), | |||||
| resnet_utils.Block('block3', bottleneck, | |||||
| [(1024, 256, 1)] * 22 + [(1024, 256, 2)]), | |||||
| resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||||
| ] | |||||
| return resnet_v1( | |||||
| inputs, | |||||
| blocks, | |||||
| num_classes, | |||||
| is_training, | |||||
| global_pool=global_pool, | |||||
| output_stride=output_stride, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=spatial_squeeze, | |||||
| reuse=reuse, | |||||
| scope=scope) | |||||
| resnet_v1_101.default_image_size = resnet_v1.default_image_size | |||||
| def resnet_v1_152(inputs, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope='resnet_v1_152'): | |||||
| """ResNet-152 model of [1]. See resnet_v1() for arg and return description.""" | |||||
| blocks = [ | |||||
| resnet_utils.Block('block1', bottleneck, | |||||
| [(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||||
| resnet_utils.Block('block2', bottleneck, | |||||
| [(512, 128, 1)] * 7 + [(512, 128, 2)]), | |||||
| resnet_utils.Block('block3', bottleneck, | |||||
| [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), | |||||
| resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||||
| ] | |||||
| return resnet_v1( | |||||
| inputs, | |||||
| blocks, | |||||
| num_classes, | |||||
| is_training, | |||||
| global_pool=global_pool, | |||||
| output_stride=output_stride, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=spatial_squeeze, | |||||
| reuse=reuse, | |||||
| scope=scope) | |||||
| resnet_v1_152.default_image_size = resnet_v1.default_image_size | |||||
| def resnet_v1_200(inputs, | |||||
| num_classes=None, | |||||
| is_training=True, | |||||
| global_pool=True, | |||||
| output_stride=None, | |||||
| spatial_squeeze=True, | |||||
| reuse=None, | |||||
| scope='resnet_v1_200'): | |||||
| """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" | |||||
| blocks = [ | |||||
| resnet_utils.Block('block1', bottleneck, | |||||
| [(256, 64, 1)] * 2 + [(256, 64, 2)]), | |||||
| resnet_utils.Block('block2', bottleneck, | |||||
| [(512, 128, 1)] * 23 + [(512, 128, 2)]), | |||||
| resnet_utils.Block('block3', bottleneck, | |||||
| [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), | |||||
| resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) | |||||
| ] | |||||
| return resnet_v1( | |||||
| inputs, | |||||
| blocks, | |||||
| num_classes, | |||||
| is_training, | |||||
| global_pool=global_pool, | |||||
| output_stride=output_stride, | |||||
| include_root_block=True, | |||||
| spatial_squeeze=spatial_squeeze, | |||||
| reuse=reuse, | |||||
| scope=scope) | |||||
| resnet_v1_200.default_image_size = resnet_v1.default_image_size | |||||
if __name__ == '__main__':
inputs = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input')
with slim.arg_scope(resnet_arg_scope()):
net, end_points = resnet_v1_50(inputs)
| @@ -0,0 +1,231 @@ | |||||
| """Contains building blocks for various versions of Residual Networks. | |||||
| Residual networks (ResNets) were proposed in: | |||||
| Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||||
| Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 | |||||
| More variants were introduced in: | |||||
| Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun | |||||
| Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 | |||||
| We can obtain different ResNet variants by changing the network depth, width, | |||||
| and form of residual unit. This module implements the infrastructure for | |||||
| building them. Concrete ResNet units and full ResNet networks are implemented in | |||||
| the accompanying resnet_v1.py and resnet_v2.py modules. | |||||
| Compared to https://github.com/KaimingHe/deep-residual-networks, in the current | |||||
| implementation we subsample the output activations in the last residual unit of | |||||
| each block, instead of subsampling the input activations in the first residual | |||||
| unit of each block. The two implementations give identical results but our | |||||
| implementation is more memory efficient. | |||||
| """ | |||||
| import collections | |||||
| import tensorflow as tf | |||||
| import tf_slim as slim | |||||
| if tf.__version__ >= '2.0': | |||||
| tf = tf.compat.v1 | |||||
| class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): | |||||
| """A named tuple describing a ResNet block. | |||||
| Its parts are: | |||||
| scope: The scope of the `Block`. | |||||
| unit_fn: The ResNet unit function which takes as input a `Tensor` and | |||||
| returns another `Tensor` with the output of the ResNet unit. | |||||
| args: A list of length equal to the number of units in the `Block`. The list | |||||
| contains one (depth, depth_bottleneck, stride) tuple for each unit in the | |||||
| block to serve as argument to unit_fn. | |||||
| """ | |||||
| def subsample(inputs, factor, scope=None): | |||||
| """Subsamples the input along the spatial dimensions. | |||||
| Args: | |||||
| inputs: A `Tensor` of size [batch, height_in, width_in, channels]. | |||||
| factor: The subsampling factor. | |||||
| scope: Optional variable_scope. | |||||
| Returns: | |||||
| output: A `Tensor` of size [batch, height_out, width_out, channels] with the | |||||
| input, either intact (if factor == 1) or subsampled (if factor > 1). | |||||
| """ | |||||
| if factor == 1: | |||||
| return inputs | |||||
| else: | |||||
| return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) | |||||
| def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): | |||||
| """Strided 2-D convolution with 'SAME' padding. | |||||
| When stride > 1, then we do explicit zero-padding, followed by conv2d with | |||||
| 'VALID' padding. | |||||
| Note that | |||||
| net = conv2d_same(inputs, num_outputs, 3, stride=stride) | |||||
| is equivalent to | |||||
| net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') | |||||
| net = subsample(net, factor=stride) | |||||
| whereas | |||||
| net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') | |||||
| is different when the input's height or width is even, which is why we add the | |||||
| current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). | |||||
| Args: | |||||
| inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. | |||||
| num_outputs: An integer, the number of output filters. | |||||
| kernel_size: An int with the kernel_size of the filters. | |||||
| stride: An integer, the output stride. | |||||
| rate: An integer, rate for atrous convolution. | |||||
| scope: Scope. | |||||
| Returns: | |||||
| output: A 4-D tensor of size [batch, height_out, width_out, channels] with | |||||
| the convolution output. | |||||
| """ | |||||
| if stride == 1: | |||||
| return slim.conv2d( | |||||
| inputs, | |||||
| num_outputs, | |||||
| kernel_size, | |||||
| stride=1, | |||||
| rate=rate, | |||||
| padding='SAME', | |||||
| scope=scope) | |||||
| else: | |||||
| kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) | |||||
| pad_total = kernel_size_effective - 1 | |||||
| pad_beg = pad_total // 2 | |||||
| pad_end = pad_total - pad_beg | |||||
| inputs = tf.pad( | |||||
| inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) | |||||
| return slim.conv2d( | |||||
| inputs, | |||||
| num_outputs, | |||||
| kernel_size, | |||||
| stride=stride, | |||||
| rate=rate, | |||||
| padding='VALID', | |||||
| scope=scope) | |||||
| @slim.add_arg_scope | |||||
| def stack_blocks_dense(net, | |||||
| blocks, | |||||
| output_stride=None, | |||||
| outputs_collections=None): | |||||
| """Stacks ResNet `Blocks` and controls output feature density. | |||||
| First, this function creates scopes for the ResNet in the form of | |||||
| 'block_name/unit_1', 'block_name/unit_2', etc. | |||||
| Second, this function allows the user to explicitly control the ResNet | |||||
| output_stride, which is the ratio of the input to output spatial resolution. | |||||
| This is useful for dense prediction tasks such as semantic segmentation or | |||||
| object detection. | |||||
| Most ResNets consist of 4 ResNet blocks and subsample the activations by a | |||||
factor of 2 when transitioning between consecutive ResNet blocks. This results
in a nominal ResNet output_stride equal to 8. If we set the output_stride to
| half the nominal network stride (e.g., output_stride=4), then we compute | |||||
| responses twice. | |||||
| Control of the output feature density is implemented by atrous convolution. | |||||
| Args: | |||||
| net: A `Tensor` of size [batch, height, width, channels]. | |||||
| blocks: A list of length equal to the number of ResNet `Blocks`. Each | |||||
| element is a ResNet `Block` object describing the units in the `Block`. | |||||
| output_stride: If `None`, then the output will be computed at the nominal | |||||
| network stride. If output_stride is not `None`, it specifies the requested | |||||
| ratio of input to output spatial resolution, which needs to be equal to | |||||
| the product of unit strides from the start up to some level of the ResNet. | |||||
| For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, | |||||
| then valid values for the output_stride are 1, 2, 6, 24 or None (which | |||||
| is equivalent to output_stride=24). | |||||
| outputs_collections: Collection to add the ResNet block outputs. | |||||
| Returns: | |||||
| net: Output tensor with stride equal to the specified output_stride. | |||||
| Raises: | |||||
| ValueError: If the target output_stride is not valid. | |||||
| """ | |||||
| # The current_stride variable keeps track of the effective stride of the | |||||
| # activations. This allows us to invoke atrous convolution whenever applying | |||||
| # the next residual unit would result in the activations having stride larger | |||||
| # than the target output_stride. | |||||
| current_stride = 1 | |||||
| # The atrous convolution rate parameter. | |||||
| rate = 1 | |||||
| for block in blocks: | |||||
| with tf.variable_scope(block.scope, 'block', [net]): | |||||
| for i, unit in enumerate(block.args): | |||||
| if output_stride is not None and current_stride > output_stride: | |||||
| raise ValueError( | |||||
| 'The target output_stride cannot be reached.') | |||||
| with tf.variable_scope( | |||||
| 'unit_%d' % (i + 1), values=[net]) as sc: | |||||
| unit_depth, unit_depth_bottleneck, unit_stride = unit | |||||
| # If we have reached the target output_stride, then we need to employ | |||||
| # atrous convolution with stride=1 and multiply the atrous rate by the | |||||
| # current unit's stride for use in subsequent layers. | |||||
| if output_stride is not None and current_stride == output_stride: | |||||
| net = block.unit_fn( | |||||
| net, | |||||
| depth=unit_depth, | |||||
| depth_bottleneck=unit_depth_bottleneck, | |||||
| stride=1, | |||||
| rate=rate) | |||||
| rate *= unit_stride | |||||
| else: | |||||
| net = block.unit_fn( | |||||
| net, | |||||
| depth=unit_depth, | |||||
| depth_bottleneck=unit_depth_bottleneck, | |||||
| stride=unit_stride, | |||||
| rate=1) | |||||
| current_stride *= unit_stride | |||||
| net = slim.utils.collect_named_outputs( | |||||
| outputs_collections, sc.name, net) | |||||
| if output_stride is not None and current_stride != output_stride: | |||||
| raise ValueError('The target output_stride cannot be reached.') | |||||
| return net | |||||
| def resnet_arg_scope(weight_decay=0.0001, | |||||
| batch_norm_decay=0.997, | |||||
| batch_norm_epsilon=1e-5, | |||||
| batch_norm_scale=True): | |||||
| """Defines the default ResNet arg scope. | |||||
| TODO(gpapan): The batch-normalization related default values above are | |||||
| appropriate for use in conjunction with the reference ResNet models | |||||
| released at https://github.com/KaimingHe/deep-residual-networks. When | |||||
| training ResNets from scratch, they might need to be tuned. | |||||
| Args: | |||||
| weight_decay: The weight decay to use for regularizing the model. | |||||
| batch_norm_decay: The moving average decay when estimating layer activation | |||||
| statistics in batch normalization. | |||||
| batch_norm_epsilon: Small constant to prevent division by zero when | |||||
| normalizing activations by their variance in batch normalization. | |||||
| batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the | |||||
| activations in the batch normalization layer. | |||||
| Returns: | |||||
| An `arg_scope` to use for the resnet models. | |||||
| """ | |||||
| batch_norm_params = { | |||||
| 'decay': batch_norm_decay, | |||||
| 'epsilon': batch_norm_epsilon, | |||||
| 'scale': batch_norm_scale, | |||||
| 'updates_collections': tf.GraphKeys.UPDATE_OPS, | |||||
| } | |||||
| with slim.arg_scope( | |||||
| [slim.conv2d], | |||||
| weights_regularizer=slim.l2_regularizer(weight_decay), | |||||
| weights_initializer=slim.variance_scaling_initializer(), | |||||
| activation_fn=tf.nn.relu, | |||||
| normalizer_fn=slim.batch_norm, | |||||
| normalizer_params=batch_norm_params): | |||||
| with slim.arg_scope([slim.batch_norm], **batch_norm_params): | |||||
# padding='VALID' is used for pool1 here, matching the accompanying code
# of 'Deep Residual Learning for Image Recognition'. The original slim
# implementation uses padding='SAME' (as in
# https://github.com/facebook/fb.resnet.torch), which makes feature
# alignment easier for dense prediction tasks; switch back by setting
# slim.arg_scope([slim.max_pool2d], padding='SAME').
with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
| return arg_sc | |||||
| @@ -0,0 +1,108 @@ | |||||
| import cv2 | |||||
| import numpy as np | |||||
def rboxes_to_polygons(rboxes):
"""Convert rotated boxes to 4-point polygons.
ARGS
`rboxes`: [n, 5], each row is (cx, cy, w, h, theta)
RETURN
`polygons`: [n, 8], each row is (x1, y1, x2, y2, x3, y3, x4, y4)
"""
| theta = rboxes[:, 4:5] | |||||
| cxcy = rboxes[:, :2] | |||||
| half_w = rboxes[:, 2:3] / 2. | |||||
| half_h = rboxes[:, 3:4] / 2. | |||||
| v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w]) | |||||
| v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h]) | |||||
| p1 = cxcy - v1 - v2 | |||||
| p2 = cxcy + v1 - v2 | |||||
| p3 = cxcy + v1 + v2 | |||||
| p4 = cxcy - v1 + v2 | |||||
| polygons = np.hstack([p1, p2, p3, p4]) | |||||
| return polygons | |||||
| def cal_width(box): | |||||
| pd1 = point_dist(box[0], box[1], box[2], box[3]) | |||||
| pd2 = point_dist(box[4], box[5], box[6], box[7]) | |||||
| return (pd1 + pd2) / 2 | |||||
| def point_dist(x1, y1, x2, y2): | |||||
| return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) | |||||
| def draw_polygons(img, polygons): | |||||
| for p in polygons.tolist(): | |||||
| p = [int(o) for o in p] | |||||
| cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1) | |||||
| cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1) | |||||
| cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1) | |||||
| cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1) | |||||
| return img | |||||
def nms_python(boxes):
"""Pure-python nms over 9-element boxes (8 polygon coords + a score, here
the line width): a lower-scored box is suppressed when the center of either
box lies inside the other."""
boxes = sorted(boxes, key=lambda x: -x[8])
| nms_flag = [True] * len(boxes) | |||||
| for i, a in enumerate(boxes): | |||||
| if not nms_flag[i]: | |||||
| continue | |||||
| else: | |||||
| for j, b in enumerate(boxes): | |||||
| if not j > i: | |||||
| continue | |||||
| if not nms_flag[j]: | |||||
| continue | |||||
| score_a = a[8] | |||||
| score_b = b[8] | |||||
| rbox_a = polygon2rbox(a[:8]) | |||||
| rbox_b = polygon2rbox(b[:8]) | |||||
| if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox( | |||||
| rbox_b[:2], rbox_a): | |||||
| if score_a > score_b: | |||||
| nms_flag[j] = False | |||||
| boxes_nms = [] | |||||
| for i, box in enumerate(boxes): | |||||
| if nms_flag[i]: | |||||
| boxes_nms.append(box) | |||||
| return boxes_nms | |||||
| def point_in_rbox(c, rbox): | |||||
| cx0, cy0 = c[0], c[1] | |||||
| cx1, cy1 = rbox[0], rbox[1] | |||||
| w, h = rbox[2], rbox[3] | |||||
| theta = rbox[4] | |||||
| dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta)) | |||||
| dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta)) | |||||
| return ((dist_x < w / 2.0) and (dist_y < h / 2.0)) | |||||
| def polygon2rbox(polygon): | |||||
| x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6] | |||||
| y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7] | |||||
| c_x = (x1 + x2 + x3 + x4) / 4 | |||||
| c_y = (y1 + y2 + y3 + y4) / 4 | |||||
| w1 = point_dist(x1, y1, x2, y2) | |||||
| w2 = point_dist(x3, y3, x4, y4) | |||||
| h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) | |||||
| h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) | |||||
| h = h1 + h2 | |||||
| w = (w1 + w2) / 2 | |||||
| theta1 = np.arctan2(y2 - y1, x2 - x1) | |||||
| theta2 = np.arctan2(y3 - y4, x3 - x4) | |||||
| theta = (theta1 + theta2) / 2.0 | |||||
| return [c_x, c_y, w, h, theta] | |||||
| def point_line_dist(px, py, x1, y1, x2, y2): | |||||
| eps = 1e-6 | |||||
| dx = x2 - x1 | |||||
| dy = y2 - y1 | |||||
| div = np.sqrt(dx * dx + dy * dy) + eps | |||||
| dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div | |||||
| return dist | |||||
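A small self-contained check of the pure-python helpers that replace the previous c++ nms/decoder (a sketch with made-up box values, not part of the diff):

import numpy as np

# two near-duplicate rotated boxes, each row (cx, cy, w, h, theta)
rboxes = np.array([[50., 20., 80., 16., 0.00],
                   [52., 21., 78., 16., 0.02]])
polygons = rboxes_to_polygons(rboxes)          # -> [2, 8] corner coordinates
# append the line width as the ranking score, as the pipeline postprocess does
candidates = [p + [cal_width(p)] for p in polygons.tolist()]
kept = nms_python(candidates)                  # narrower duplicate is suppressed
assert len(kept) == 1 and len(kept[0]) == 9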
| @@ -54,6 +54,13 @@ TASK_OUTPUTS = {
# }
Tasks.pose_estimation: ['poses', 'boxes'],
# ocr detection result for single sample
# {
#     "det_polygons": np.array with shape [num_text, 8], each box is
#     [x1, y1, x2, y2, x3, y3, x4, y4]
# }
Tasks.ocr_detection: ['det_polygons'],
# ============ nlp tasks ===================
# text classification result for single sample
| @@ -28,6 +28,7 @@ class Tasks(object):
image_editing = 'image-editing'
image_generation = 'image-generation'
image_matting = 'image-matting'
ocr_detection = 'ocr-detection'
# nlp tasks
word_segmentation = 'word-segmentation'
| @@ -1 +1,2 @@
easydict
tf_slim
| @@ -0,0 +1,37 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os.path as osp | |||||
| import shutil | |||||
| import sys | |||||
| import tempfile | |||||
| import unittest | |||||
| from typing import Any, Dict, List, Tuple, Union | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import PIL | |||||
| from modelscope.pipelines import pipeline | |||||
| from modelscope.pipelines.base import Pipeline | |||||
| from modelscope.utils.constant import Tasks | |||||
| from modelscope.utils.test_utils import test_level | |||||
| class OCRDetectionTest(unittest.TestCase): | |||||
| def setUp(self) -> None: | |||||
| self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo' | |||||
| self.test_image = 'data/test/images/ocr_detection.jpg' | |||||
| def pipeline_inference(self, pipeline: Pipeline, input_location: str): | |||||
| result = pipeline(input_location) | |||||
| print('ocr detection results: ') | |||||
| print(result) | |||||
| @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') | |||||
| def test_run_modelhub_default_model(self): | |||||
| ocr_detection = pipeline(Tasks.ocr_detection) | |||||
| self.pipeline_inference(ocr_detection, self.test_image) | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
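Optionally, a hedged sketch of an extra test method (to sit inside OCRDetectionTest) that pins the model id explicitly instead of relying on the default-model mapping; the explicit model= argument to pipeline() is assumed to be supported:

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_with_model_id(self):
        # build the pipeline from the explicit model id declared in setUp
        ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id)
        self.pipeline_inference(ocr_detection, self.test_image)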