# Conflicts: # modelscope/models/multi_modal/ofa/utils/__init__.pymaster
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e | |||||
| size 123341 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487 | |||||
| size 23889 | |||||
| @@ -0,0 +1,3 @@ | |||||
| version https://git-lfs.github.com/spec/v1 | |||||
| oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec | |||||
| size 108357 | |||||
| @@ -40,6 +40,9 @@ class Models(object): | |||||
| ulfd = 'ulfd' | ulfd = 'ulfd' | ||||
| video_inpainting = 'video-inpainting' | video_inpainting = 'video-inpainting' | ||||
| hand_static = 'hand-static' | hand_static = 'hand-static' | ||||
| face_human_hand_detection = 'face-human-hand-detection' | |||||
| face_emotion = 'face-emotion' | |||||
| product_segmentation = 'product-segmentation' | |||||
| # EasyCV models | # EasyCV models | ||||
| yolox = 'YOLOX' | yolox = 'YOLOX' | ||||
| @@ -179,9 +182,16 @@ class Pipelines(object): | |||||
| movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | ||||
| shop_segmentation = 'shop-segmentation' | shop_segmentation = 'shop-segmentation' | ||||
| video_inpainting = 'video-inpainting' | video_inpainting = 'video-inpainting' | ||||
| pst_action_recognition = 'patchshift-action-recognition' | |||||
| hand_static = 'hand-static' | hand_static = 'hand-static' | ||||
| face_human_hand_detection = 'face-human-hand-detection' | |||||
| face_emotion = 'face-emotion' | |||||
| product_segmentation = 'product-segmentation' | |||||
| # nlp tasks | # nlp tasks | ||||
| automatic_post_editing = 'automatic-post-editing' | |||||
| translation_quality_estimation = 'translation-quality-estimation' | |||||
| domain_classification = 'domain-classification' | |||||
| sentence_similarity = 'sentence-similarity' | sentence_similarity = 'sentence-similarity' | ||||
| word_segmentation = 'word-segmentation' | word_segmentation = 'word-segmentation' | ||||
| part_of_speech = 'part-of-speech' | part_of_speech = 'part-of-speech' | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Part of the implementation is borrowed and modified from BasicSR, publicly available at | |||||
| # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py | |||||
| from typing import Dict | from typing import Dict | ||||
| import numpy as np | import numpy as np | ||||
| @@ -7,11 +7,13 @@ if TYPE_CHECKING: | |||||
| from .models import BaseVideoModel | from .models import BaseVideoModel | ||||
| from .tada_convnext import TadaConvNeXt | from .tada_convnext import TadaConvNeXt | ||||
| from .temporal_patch_shift_transformer import PatchShiftTransformer | |||||
| else: | else: | ||||
| _import_structure = { | _import_structure = { | ||||
| 'models': ['BaseVideoModel'], | 'models': ['BaseVideoModel'], | ||||
| 'tada_convnext': ['TadaConvNeXt'], | 'tada_convnext': ['TadaConvNeXt'], | ||||
| 'temporal_patch_shift_transformer': ['PatchShiftTransformer'] | |||||
| } | } | ||||
| import sys | import sys | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation. | |||||
| import os | import os | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation. | |||||
| cfg_128x128_15 = { | cfg_128x128_15 = { | ||||
| 'DATASET': { | 'DATASET': { | ||||
| 'TYPE': 'DAMO', | 'TYPE': 'DAMO', | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import logging | import logging | ||||
| import os.path as osp | import os.path as osp | ||||
| from typing import Any, Dict, List, Union | from typing import Any, Dict, List, Union | ||||
| @@ -1,4 +1,4 @@ | |||||
| # The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D | |||||
| # The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||||
| import numpy as np | import numpy as np | ||||
| from modelscope.models.cv.cartoon.facelib.config import config as cfg | from modelscope.models.cv.cartoon.facelib.config import config as cfg | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||||
| import os | import os | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||||
| import time | import time | ||||
| import cv2 | import cv2 | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| import tensorflow as tf | import tensorflow as tf | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||||
| import time | import time | ||||
| import cv2 | import cv2 | ||||
| @@ -1,7 +1,5 @@ | |||||
| """ | |||||
| Created on Mon Apr 24 15:43:29 2017 | |||||
| @author: zhaoy | |||||
| """ | |||||
| # The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,8 +1,4 @@ | |||||
| """ | |||||
| Created on Tue Jul 11 06:54:28 2017 | |||||
| @author: zhaoyafei | |||||
| """ | |||||
| # The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch | |||||
| import numpy as np | import numpy as np | ||||
| from numpy.linalg import inv, lstsq | from numpy.linalg import inv, lstsq | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| import cv2 | import cv2 | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .models.detectors import MogFaceDetector | from .models.detectors import MogFaceDetector | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .models.detector import MtcnnFaceDetector | from .models.detector import MtcnnFaceDetector | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .detection import RetinaFaceDetection | from .detection import RetinaFaceDetection | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .detection import UlfdFaceDetector | from .detection import UlfdFaceDetector | ||||
| @@ -0,0 +1,20 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
# Static type checkers see a regular import of the public class; at runtime
# the module object registered under this package's name is replaced by a
# LazyImportModule built from ``_import_structure``.
# NOTE(review): presumably LazyImportModule defers importing ``emotion_model``
# until ``EfficientNetForFaceEmotion`` is first accessed - confirm against
# modelscope.utils.import_utils.
if TYPE_CHECKING:
    from .emotion_model import EfficientNetForFaceEmotion
else:
    # Maps each submodule name to the public names it provides.
    _import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']}
    import sys
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,6 @@ | |||||
| # The implementation here is modified based on EfficientNet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch | |||||
| from .model import VALID_MODELS, EfficientNet | |||||
| from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet, | |||||
| get_model_params) | |||||
| @@ -0,0 +1,380 @@ | |||||
| # The implementation here is modified based on EfficientNet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.nn import functional as F | |||||
| from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size, | |||||
| drop_connect, efficientnet_params, get_model_params, | |||||
| get_same_padding_conv2d, load_pretrained_weights, | |||||
| round_filters, round_repeats) | |||||
# Model names accepted by EfficientNet._check_model_name_is_valid; any other
# name makes that check raise ValueError.
VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2',
                'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5',
                'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8',
                'efficientnet-l2')
class MBConvBlock(nn.Module):
    """Inverted-bottleneck convolution block used to build EfficientNet.

    The block optionally widens the input with a 1x1 convolution, runs a
    depthwise convolution, optionally re-weights the channels through a
    squeeze-and-excitation branch and finally projects to the output width
    with another 1x1 convolution.

    Args:
        block_args (namedtuple): BlockArgs describing this block.
        global_params (namedtuple): GlobalParams; only the batch-norm
            momentum and epsilon are read here.
        image_size (int, tuple or list, optional): Spatial size of the block
            input, forwarded to ``get_same_padding_conv2d``.
    """

    def __init__(self, block_args, global_params, image_size=None):
        super().__init__()
        self._block_args = block_args
        # BatchNorm2d receives the complement of the configured momentum.
        self._bn_mom = 1 - global_params.batch_norm_momentum
        self._bn_eps = global_params.batch_norm_epsilon
        se_ratio = block_args.se_ratio
        self.has_se = (se_ratio is not None) and (0 < se_ratio <= 1)
        self.id_skip = block_args.id_skip

        in_filters = block_args.input_filters
        expanded = in_filters * block_args.expand_ratio

        def batch_norm(width):
            return nn.BatchNorm2d(
                num_features=width, momentum=self._bn_mom, eps=self._bn_eps)

        # Expansion: only present when the block actually widens its input.
        if block_args.expand_ratio != 1:
            expand_cls = get_same_padding_conv2d(image_size=image_size)
            self._expand_conv = expand_cls(
                in_channels=in_filters,
                out_channels=expanded,
                kernel_size=1,
                bias=False)
            self._bn0 = batch_norm(expanded)

        # Depthwise convolution (one group per channel).
        stride = block_args.stride
        depthwise_cls = get_same_padding_conv2d(image_size=image_size)
        self._depthwise_conv = depthwise_cls(
            in_channels=expanded,
            out_channels=expanded,
            groups=expanded,
            kernel_size=block_args.kernel_size,
            stride=stride,
            bias=False)
        self._bn1 = batch_norm(expanded)
        image_size = calculate_output_image_size(image_size, stride)

        # Squeeze-and-excitation operates on a 1x1 pooled map.
        if self.has_se:
            se_cls = get_same_padding_conv2d(image_size=(1, 1))
            squeezed = max(1, int(in_filters * se_ratio))
            self._se_reduce = se_cls(
                in_channels=expanded, out_channels=squeezed, kernel_size=1)
            self._se_expand = se_cls(
                in_channels=squeezed, out_channels=expanded, kernel_size=1)

        # Point-wise projection to the block's output width.
        out_filters = block_args.output_filters
        project_cls = get_same_padding_conv2d(image_size=image_size)
        self._project_conv = project_cls(
            in_channels=expanded,
            out_channels=out_filters,
            kernel_size=1,
            bias=False)
        self._bn2 = batch_norm(out_filters)
        self._swish = MemoryEfficientSwish()

    def forward(self, inputs, drop_connect_rate=None):
        """Run the block on ``inputs``.

        Args:
            inputs (tensor): Input tensor.
            drop_connect_rate (float, optional): Drop connect rate between
                0 and 1; a falsy value disables drop connect.

        Returns:
            Output of this block after processing.
        """
        args = self._block_args
        x = inputs
        if args.expand_ratio != 1:
            x = self._swish(self._bn0(self._expand_conv(inputs)))

        x = self._swish(self._bn1(self._depthwise_conv(x)))

        if self.has_se:
            gate = F.adaptive_avg_pool2d(x, 1)
            gate = self._se_expand(self._swish(self._se_reduce(gate)))
            x = torch.sigmoid(gate) * x

        x = self._bn2(self._project_conv(x))

        # The residual path is taken only when the shapes are unchanged.
        same_width = args.input_filters == args.output_filters
        if not (self.id_skip and args.stride == 1 and same_width):
            return x
        if drop_connect_rate:
            x = drop_connect(x, p=drop_connect_rate, training=self.training)
        return x + inputs

    def set_swish(self, memory_efficient=True):
        """Select the swish implementation used by this block.

        Args:
            memory_efficient (bool): Use ``MemoryEfficientSwish`` when true,
                the standard ``Swish`` otherwise.
        """
        if memory_efficient:
            self._swish = MemoryEfficientSwish()
        else:
            self._swish = Swish()
class EfficientNet(nn.Module):
    """EfficientNet model (https://arxiv.org/abs/1905.11946).

    The network is a stem convolution, a stack of ``MBConvBlock`` modules
    described by ``blocks_args``, a 1x1 head convolution, global average
    pooling and, when ``global_params.include_top`` is set, dropout followed
    by a linear classifier.  It is most easily built with ``from_name``.

    Args:
        blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
        global_params (namedtuple): A set of GlobalParams shared between blocks.

    Example:
        >>> import torch
        >>> from efficientnet.model import EfficientNet
        >>> inputs = torch.rand(1, 3, 224, 224)
        >>> model = EfficientNet.from_name('efficientnet-b0')
        >>> model.eval()
        >>> outputs = model(inputs)
    """

    def __init__(self, blocks_args=None, global_params=None):
        super().__init__()
        assert isinstance(blocks_args, list), 'blocks_args should be a list'
        assert len(blocks_args) > 0, 'block args must be greater than 0'
        self._global_params = global_params
        self._blocks_args = blocks_args

        # BatchNorm2d receives the complement of the configured momentum.
        bn_mom = 1 - self._global_params.batch_norm_momentum
        bn_eps = self._global_params.batch_norm_epsilon

        # Stem
        image_size = global_params.image_size
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        in_channels = 3
        out_channels = round_filters(32, self._global_params)
        self._conv_stem = Conv2d(
            in_channels, out_channels, kernel_size=3, stride=2, bias=False)
        self._bn0 = nn.BatchNorm2d(
            num_features=out_channels, momentum=bn_mom, eps=bn_eps)
        image_size = calculate_output_image_size(image_size, 2)

        # Blocks
        self._blocks = nn.ModuleList([])
        for block_args in self._blocks_args:
            # Scale widths and depth by the model's multipliers.
            block_args = block_args._replace(
                input_filters=round_filters(block_args.input_filters,
                                            self._global_params),
                output_filters=round_filters(block_args.output_filters,
                                             self._global_params),
                num_repeat=round_repeats(block_args.num_repeat,
                                         self._global_params))

            # The first block of a stage carries the stride and the change
            # of width.
            self._blocks.append(
                MBConvBlock(
                    block_args, self._global_params, image_size=image_size))
            image_size = calculate_output_image_size(image_size,
                                                     block_args.stride)
            # Repeats keep the width and use stride 1.
            if block_args.num_repeat > 1:
                block_args = block_args._replace(
                    input_filters=block_args.output_filters, stride=1)
            for _ in range(block_args.num_repeat - 1):
                self._blocks.append(
                    MBConvBlock(
                        block_args,
                        self._global_params,
                        image_size=image_size))

        # Head: fed by the output width of the last block built above.
        in_channels = block_args.output_filters
        out_channels = round_filters(1280, self._global_params)
        Conv2d = get_same_padding_conv2d(image_size=image_size)
        self._conv_head = Conv2d(
            in_channels, out_channels, kernel_size=1, bias=False)
        self._bn1 = nn.BatchNorm2d(
            num_features=out_channels, momentum=bn_mom, eps=bn_eps)

        # Final pooling and optional classifier
        self._avg_pooling = nn.AdaptiveAvgPool2d(1)
        if self._global_params.include_top:
            self._dropout = nn.Dropout(self._global_params.dropout_rate)
            self._fc = nn.Linear(out_channels, self._global_params.num_classes)

        self._swish = MemoryEfficientSwish()

    def set_swish(self, memory_efficient=True):
        """Sets swish function as memory efficient (for training) or standard (for export).

        Args:
            memory_efficient (bool): Whether to use memory-efficient version of swish.
        """
        self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
        for block in self._blocks:
            block.set_swish(memory_efficient)

    def _scaled_drop_connect_rate(self, idx):
        """Return the drop connect rate to use for the block at ``idx``.

        The configured rate is scaled by ``idx / len(self._blocks)``, so the
        first block gets 0.  A falsy configured rate (``None`` or 0) is
        returned unchanged.
        """
        rate = self._global_params.drop_connect_rate
        if rate:
            rate *= float(idx) / len(self._blocks)
        return rate

    def extract_endpoints(self, inputs):
        """Collect intermediate feature maps at each spatial reduction.

        A feature map is recorded whenever the next block reduces the height
        of its input, after the last block, and after the head convolution.

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Dictionary mapping ``'reduction_<i>'`` (numbered from 1 in the
            order they were recorded) to the corresponding feature tensor.

        Example:
            >>> import torch
            >>> from efficientnet.model import EfficientNet
            >>> inputs = torch.rand(1, 3, 224, 224)
            >>> model = EfficientNet.from_name('efficientnet-b0')
            >>> endpoints = model.extract_endpoints(inputs)
            >>> print(endpoints['reduction_1'].shape)  # torch.Size([1, 16, 112, 112])
            >>> print(endpoints['reduction_2'].shape)  # torch.Size([1, 24, 56, 56])
            >>> print(endpoints['reduction_3'].shape)  # torch.Size([1, 40, 28, 28])
            >>> print(endpoints['reduction_4'].shape)  # torch.Size([1, 112, 14, 14])
            >>> print(endpoints['reduction_5'].shape)  # torch.Size([1, 320, 7, 7])
            >>> print(endpoints['reduction_6'].shape)  # torch.Size([1, 1280, 7, 7])
        """
        endpoints = dict()

        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))
        prev_x = x

        # Blocks
        last_idx = len(self._blocks) - 1
        for idx, block in enumerate(self._blocks):
            x = block(
                x, drop_connect_rate=self._scaled_drop_connect_rate(idx))
            if prev_x.size(2) > x.size(2):
                # This block shrank the map: keep what fed into it.
                endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x
            elif idx == last_idx:
                endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
            prev_x = x

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))
        endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
        return endpoints

    def extract_features(self, inputs):
        """Run the stem, all blocks and the head convolution.

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Output of the final convolution layer in the efficientnet model.
        """
        # Stem
        x = self._swish(self._bn0(self._conv_stem(inputs)))

        # Blocks
        for idx, block in enumerate(self._blocks):
            x = block(
                x, drop_connect_rate=self._scaled_drop_connect_rate(idx))

        # Head
        x = self._swish(self._bn1(self._conv_head(x)))
        return x

    def forward(self, inputs):
        """EfficientNet's forward function.

        Calls extract_features and applies global average pooling.  When
        ``global_params.include_top`` is set the result is flattened and
        passed through dropout and the final linear layer.

        Args:
            inputs (tensor): Input tensor.

        Returns:
            Output of this model after processing.
        """
        x = self.extract_features(inputs)
        x = self._avg_pooling(x)
        if self._global_params.include_top:
            x = x.flatten(start_dim=1)
            x = self._dropout(x)
            x = self._fc(x)
        return x

    @classmethod
    def from_name(cls, model_name, in_channels=3, **override_params):
        """Create an efficientnet model according to name.

        Args:
            model_name (str): Name for efficientnet.
            in_channels (int): Input data's channel number.
            override_params (other key word params):
                Params to override model's global_params.
                Optional key:
                    'width_coefficient', 'depth_coefficient',
                    'image_size', 'dropout_rate',
                    'num_classes', 'batch_norm_momentum',
                    'batch_norm_epsilon', 'drop_connect_rate',
                    'depth_divisor', 'min_depth'

        Returns:
            An efficientnet model.

        Raises:
            ValueError: If ``model_name`` is not in ``VALID_MODELS``.
        """
        cls._check_model_name_is_valid(model_name)
        blocks_args, global_params = get_model_params(model_name,
                                                      override_params)
        model = cls(blocks_args, global_params)
        model._change_in_channels(in_channels)
        return model

    @classmethod
    def from_pretrained(cls,
                        model_name,
                        weights_path=None,
                        advprop=False,
                        in_channels=3,
                        num_classes=1000,
                        **override_params):
        """Create an efficientnet model according to name.

        NOTE: this implementation does NOT load any weights.  The returned
        model has freshly initialised parameters; load a state dict
        explicitly afterwards.  ``weights_path`` and ``advprop`` are kept
        only for signature compatibility and a warning is emitted when they
        are supplied.

        Args:
            model_name (str): Name for efficientnet.
            weights_path (None or str): Ignored.
            advprop (bool): Ignored.
            in_channels (int): Input data's channel number.
            num_classes (int):
                Number of categories for classification.
                It controls the output size for final linear layer.
            override_params (other key word params):
                Params to override model's global_params.
                Optional key:
                    'width_coefficient', 'depth_coefficient',
                    'image_size', 'dropout_rate',
                    'batch_norm_momentum',
                    'batch_norm_epsilon', 'drop_connect_rate',
                    'depth_divisor', 'min_depth'

        Returns:
            An efficientnet model (weights not loaded).

        Raises:
            ValueError: If ``model_name`` is not in ``VALID_MODELS``.
        """
        if weights_path is not None or advprop:
            import warnings
            warnings.warn(
                'EfficientNet.from_pretrained does not load weights in this '
                'implementation; `weights_path` and `advprop` are ignored.',
                stacklevel=2)
        # from_name already adapts the stem to ``in_channels``.
        return cls.from_name(
            model_name,
            in_channels=in_channels,
            num_classes=num_classes,
            **override_params)

    @classmethod
    def get_image_size(cls, model_name):
        """Get the input image size for a given efficientnet model.

        Args:
            model_name (str): Name for efficientnet.

        Returns:
            Input image size (resolution).

        Raises:
            ValueError: If ``model_name`` is not in ``VALID_MODELS``.
        """
        cls._check_model_name_is_valid(model_name)
        _, _, res, _ = efficientnet_params(model_name)
        return res

    @classmethod
    def _check_model_name_is_valid(cls, model_name):
        """Validate the model name.

        Args:
            model_name (str): Name for efficientnet.

        Raises:
            ValueError: If ``model_name`` is not in ``VALID_MODELS``.
        """
        if model_name not in VALID_MODELS:
            raise ValueError('model_name should be one of: '
                             + ', '.join(VALID_MODELS))

    def _change_in_channels(self, in_channels):
        """Adjust model's first convolution layer to in_channels, if in_channels not equals 3.

        The stem convolution is replaced by a newly constructed layer.

        Args:
            in_channels (int): Input data's channel number.
        """
        if in_channels != 3:
            Conv2d = get_same_padding_conv2d(
                image_size=self._global_params.image_size)
            out_channels = round_filters(32, self._global_params)
            self._conv_stem = Conv2d(
                in_channels, out_channels, kernel_size=3, stride=2, bias=False)
| @@ -0,0 +1,559 @@ | |||||
| # The implementation here is modified based on EfficientNet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch | |||||
| import collections | |||||
| import math | |||||
| import re | |||||
| from functools import partial | |||||
| import torch | |||||
| from torch import nn | |||||
| from torch.nn import functional as F | |||||
| from torch.utils import model_zoo | |||||
# Model-wide hyper-parameters shared by every block.
GlobalParams = collections.namedtuple(
    'GlobalParams',
    [
        'width_coefficient',
        'depth_coefficient',
        'image_size',
        'dropout_rate',
        'num_classes',
        'batch_norm_momentum',
        'batch_norm_epsilon',
        'drop_connect_rate',
        'depth_divisor',
        'min_depth',
        'include_top',
    ],
)

# Per-stage description of a group of MBConv blocks.
BlockArgs = collections.namedtuple(
    'BlockArgs',
    [
        'num_repeat',
        'kernel_size',
        'stride',
        'expand_ratio',
        'input_filters',
        'output_filters',
        'se_ratio',
        'id_skip',
    ],
)

# Every field of both tuples defaults to None so they can be built partially.
for _params_type in (GlobalParams, BlockArgs):
    _params_type.__new__.__defaults__ = (None, ) * len(_params_type._fields)
del _params_type
# Use the SiLU module shipped with torch when it exists; otherwise fall back
# to an equivalent hand-written module.
try:
    Swish = nn.SiLU
except AttributeError:

    class Swish(nn.Module):
        """Swish activation: ``x * sigmoid(x)``."""

        def forward(self, x):
            return x * torch.sigmoid(x)
class SwishImplementation(torch.autograd.Function):
    """Swish (``x * sigmoid(x)``) with a hand-written backward pass.

    Only the input is saved for the backward pass; the sigmoid is recomputed
    there.
    """

    @staticmethod
    def forward(ctx, i):
        ctx.save_for_backward(i)
        return i * torch.sigmoid(i)

    @staticmethod
    def backward(ctx, grad_output):
        (saved_input, ) = ctx.saved_tensors
        gate = torch.sigmoid(saved_input)
        # d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x)))
        return grad_output * (gate * (1 + saved_input * (1 - gate)))
class MemoryEfficientSwish(nn.Module):
    """Module wrapper that applies ``SwishImplementation`` to its input."""

    def forward(self, x):
        activated = SwishImplementation.apply(x)
        return activated
def round_filters(filters, global_params):
    """Scale a filter count by the width multiplier and round it.

    Reads width_coefficient, depth_divisor and min_depth from global_params.
    The scaled count is rounded to the nearest multiple of depth_divisor,
    never below min_depth (or depth_divisor when min_depth is unset), and is
    bumped up by one divisor if rounding lost more than 10%.

    Args:
        filters (int): Filters number to be calculated.
        global_params (namedtuple): Global params of the model.

    Returns:
        new_filters: New filters number after calculating; ``filters``
        unchanged when no width multiplier is configured.
    """
    width_mult = global_params.width_coefficient
    if not width_mult:
        return filters

    step = global_params.depth_divisor
    lower_bound = global_params.min_depth or step
    scaled = filters * width_mult
    rounded = max(lower_bound, int(scaled + step / 2) // step * step)
    if rounded < 0.9 * scaled:
        rounded += step
    return int(rounded)
def round_repeats(repeats, global_params):
    """Scale a block's repeat count by the depth multiplier, rounding up.

    Reads depth_coefficient from global_params.

    Args:
        repeats (int): num_repeat to be calculated.
        global_params (namedtuple): Global params of the model.

    Returns:
        new repeat: New repeat number after calculating; ``repeats``
        unchanged when no depth multiplier is configured.
    """
    depth_mult = global_params.depth_coefficient
    return int(math.ceil(depth_mult * repeats)) if depth_mult else repeats
def drop_connect(inputs, p, training):
    """Drop connect.

    During training each sample of the batch is either zeroed as a whole or
    kept and rescaled by ``1 / (1 - p)``; outside training the input is
    returned untouched.

    Args:
        inputs (tensor: BCWH): Input of this structure.
        p (float: 0.0~1.0): Probability of drop connection.
        training (bool): The running mode.

    Returns:
        output: Output after drop connection.
    """
    assert 0 <= p <= 1, 'p must be in range of [0,1]'

    if not training:
        return inputs

    keep_prob = 1 - p
    if keep_prob == 0:
        # Everything is dropped.  Dividing by keep_prob here would give
        # inf * 0 = NaN, so return zeros directly (still tied to ``inputs``).
        return inputs * 0

    batch_size = inputs.shape[0]
    # One random draw per sample, broadcast over the remaining dimensions;
    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
    random_tensor = keep_prob + torch.rand([batch_size, 1, 1, 1],
                                           dtype=inputs.dtype,
                                           device=inputs.device)
    binary_tensor = torch.floor(random_tensor)

    return inputs / keep_prob * binary_tensor
def get_width_and_height_from_size(x):
    """Obtain height and width from x.

    Despite the function name, callers unpack the result as
    ``(height, width)``.

    Args:
        x (int, tuple or list): Data size.  An int is used for both
            dimensions; a list or tuple is returned as is.

    Returns:
        size: A tuple or list (H,W).

    Raises:
        TypeError: If ``x`` is not an int, list or tuple.
    """
    if isinstance(x, int):
        return x, x
    if isinstance(x, (list, tuple)):
        return x
    raise TypeError(
        'size must be an int, list or tuple, got {}'.format(type(x).__name__))
def calculate_output_image_size(input_image_size, stride):
    """Calculates the output image size when using Conv2dSamePadding with a stride.

    Necessary for static padding. Thanks to mannatsingh for pointing this out.

    Args:
        input_image_size (int, tuple or list): Size of input image.
        stride (int, tuple or list): Conv2d operation's stride.  An int or a
            one-element sequence applies to both axes; a two-element
            sequence is read as (stride_h, stride_w).

    Returns:
        output_image_size: A list [H,W], or None when ``input_image_size``
        is None.
    """
    if input_image_size is None:
        return None
    image_height, image_width = get_width_and_height_from_size(
        input_image_size)
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h = stride[0]
        # Use the second entry for the width when the stride is given per
        # axis, matching how the same-padding conv layers read their stride.
        stride_w = stride[1] if len(stride) > 1 else stride[0]
    image_height = int(math.ceil(image_height / stride_h))
    image_width = int(math.ceil(image_width / stride_w))
    return [image_height, image_width]
def get_same_padding_conv2d(image_size=None):
    """Pick the same-padding convolution class for the given image size.

    Static padding is used when an image size is specified and dynamic
    padding otherwise.  Static padding is necessary for ONNX exporting of
    models.

    Args:
        image_size (int or tuple): Size of the image.

    Returns:
        Conv2dDynamicSamePadding, or Conv2dStaticSamePadding with
        ``image_size`` already bound.
    """
    if image_size is not None:
        return partial(Conv2dStaticSamePadding, image_size=image_size)
    return Conv2dDynamicSamePadding
class Conv2dDynamicSamePadding(nn.Conv2d):
    """2D convolution with TensorFlow-style 'SAME' padding for any input size.

    The amount of padding is computed from the actual input on every forward
    call, so the output spatial size is ``ceil(input / stride)``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 dilation=1,
                 groups=1,
                 bias=True):
        # The built-in padding is fixed to 0; padding is applied in forward.
        super().__init__(in_channels, out_channels, kernel_size, stride, 0,
                         dilation, groups, bias)
        if len(self.stride) != 2:
            self.stride = [self.stride[0]] * 2

    def forward(self, x):
        in_h, in_w = x.size()[-2:]
        kernel_h, kernel_w = self.weight.size()[-2:]
        stride_h, stride_w = self.stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)

        # Total padding needed so the last window still fits the input.
        span_h = (out_h - 1) * self.stride[0]
        span_w = (out_w - 1) * self.stride[1]
        pad_h = max(span_h + (kernel_h - 1) * self.dilation[0] + 1 - in_h, 0)
        pad_w = max(span_w + (kernel_w - 1) * self.dilation[1] + 1 - in_w, 0)

        if pad_h > 0 or pad_w > 0:
            left, top = pad_w // 2, pad_h // 2
            # Any odd remainder goes to the right / bottom side.
            x = F.pad(x, [left, pad_w - left, top, pad_h - top])
        return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)
class Conv2dStaticSamePadding(nn.Conv2d):
    """2D convolution with TensorFlow-style 'SAME' padding for a known input size.

    The padding module is built once in the constructor from ``image_size``
    and then applied in forward.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 image_size=None,
                 **kwargs):
        super().__init__(in_channels, out_channels, kernel_size, stride,
                         **kwargs)
        if len(self.stride) != 2:
            self.stride = [self.stride[0]] * 2

        # The padding amount depends on the input size, so it is mandatory.
        assert image_size is not None
        if isinstance(image_size, int):
            in_h = in_w = image_size
        else:
            in_h, in_w = image_size
        kernel_h, kernel_w = self.weight.size()[-2:]
        stride_h, stride_w = self.stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)

        # Total padding needed so the last window still fits the input.
        span_h = (out_h - 1) * self.stride[0]
        span_w = (out_w - 1) * self.stride[1]
        pad_h = max(span_h + (kernel_h - 1) * self.dilation[0] + 1 - in_h, 0)
        pad_w = max(span_w + (kernel_w - 1) * self.dilation[1] + 1 - in_w, 0)

        if pad_h > 0 or pad_w > 0:
            left, top = pad_w // 2, pad_h // 2
            # Any odd remainder goes to the right / bottom side.
            self.static_padding = nn.ZeroPad2d(
                (left, pad_w - left, top, pad_h - top))
        else:
            self.static_padding = nn.Identity()

    def forward(self, x):
        padded = self.static_padding(x)
        return F.conv2d(padded, self.weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)
def get_same_padding_maxPool2d(image_size=None):
    """Pick the max-pooling class that matches the padding strategy.

    Static padding is used when an image size is given, dynamic padding
    otherwise. Static padding is necessary for ONNX exporting of models.

    Args:
        image_size (int or tuple): Size of the image.

    Returns:
        MaxPool2dDynamicSamePadding, or a ``partial`` of
        MaxPool2dStaticSamePadding bound to ``image_size``.
    """
    if image_size is not None:
        return partial(MaxPool2dStaticSamePadding, image_size=image_size)
    return MaxPool2dDynamicSamePadding
class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
    """2D max pooling like TensorFlow's 'SAME' mode, for any image size.

    The padding is computed from the actual input in ``forward``.
    """

    def __init__(self,
                 kernel_size,
                 stride,
                 padding=0,
                 dilation=1,
                 return_indices=False,
                 ceil_mode=False):
        super().__init__(kernel_size, stride, padding, dilation,
                         return_indices, ceil_mode)
        # Expand scalar settings to [height, width] pairs.
        for attr in ('stride', 'kernel_size', 'dilation'):
            value = getattr(self, attr)
            if isinstance(value, int):
                setattr(self, attr, [value] * 2)

    def forward(self, x):
        """Pad ``x`` as needed and apply ``F.max_pool2d``."""
        in_h, in_w = x.size()[-2:]
        k_h, k_w = self.kernel_size
        stride_h, stride_w = self.stride
        # Target output size is ceil(input / stride) along each axis.
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)
        pad_h = max(
            (out_h - 1) * stride_h + (k_h - 1) * self.dilation[0] + 1 - in_h,
            0)
        pad_w = max(
            (out_w - 1) * stride_w + (k_w - 1) * self.dilation[1] + 1 - in_w,
            0)
        if pad_h > 0 or pad_w > 0:
            left, top = pad_w // 2, pad_h // 2
            # Any odd remainder goes to the right / bottom side.
            x = F.pad(x, [left, pad_w - left, top, pad_h - top])
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
                            self.dilation, self.ceil_mode,
                            self.return_indices)
class MaxPool2dStaticSamePadding(nn.MaxPool2d):
    """2D max pooling like TensorFlow's 'SAME' mode for a fixed image size.

    The padding module is computed once in the constructor from
    ``image_size`` and then applied in ``forward``.
    """

    def __init__(self, kernel_size, stride, image_size=None, **kwargs):
        super().__init__(kernel_size, stride, **kwargs)
        # Expand scalar settings to [height, width] pairs.
        for attr in ('stride', 'kernel_size', 'dilation'):
            value = getattr(self, attr)
            if isinstance(value, int):
                setattr(self, attr, [value] * 2)

        # The padding depends on the input size, so it must be supplied.
        assert image_size is not None
        if isinstance(image_size, int):
            in_h = in_w = image_size
        else:
            in_h, in_w = image_size
        k_h, k_w = self.kernel_size
        stride_h, stride_w = self.stride
        out_h = math.ceil(in_h / stride_h)
        out_w = math.ceil(in_w / stride_w)
        pad_h = max(
            (out_h - 1) * stride_h + (k_h - 1) * self.dilation[0] + 1 - in_h,
            0)
        pad_w = max(
            (out_w - 1) * stride_w + (k_w - 1) * self.dilation[1] + 1 - in_w,
            0)
        if pad_h > 0 or pad_w > 0:
            left, top = pad_w // 2, pad_h // 2
            # Any odd remainder goes to the right / bottom side.
            self.static_padding = nn.ZeroPad2d(
                (left, pad_w - left, top, pad_h - top))
        else:
            self.static_padding = nn.Identity()

    def forward(self, x):
        """Apply the precomputed padding, then ``F.max_pool2d``."""
        padded = self.static_padding(x)
        return F.max_pool2d(padded, self.kernel_size, self.stride,
                            self.padding, self.dilation, self.ceil_mode,
                            self.return_indices)
class BlockDecoder(object):
    """Convert between block-string notation and BlockArgs tuples.

    The notation is taken straight from the official TensorFlow repository.
    """

    @staticmethod
    def _decode_block_string(block_string):
        """Get a block through a string notation of arguments.

        Args:
            block_string (str): A string notation of arguments.
                Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.

        Returns:
            BlockArgs: The namedtuple defined at the top of this file.

        Raises:
            AssertionError: If ``block_string`` is not a str, or its stride
                option is missing or is not one digit / two equal digits.
        """
        assert isinstance(block_string, str)

        options = {}
        for op in block_string.split('_'):
            # Split at the first digit: 'se0.25' -> key 'se', value '0.25'.
            # Tokens without a digit (e.g. 'noskip') produce no option.
            splits = re.split(r'(\d.*)', op)
            if len(splits) >= 2:
                key, value = splits[:2]
                options[key] = value

        # Check stride. A missing 's' option now fails this assertion
        # instead of escaping as a KeyError.
        stride = options.get('s', '')
        assert (len(stride) == 1
                or (len(stride) == 2 and stride[0] == stride[1])), (
                    'invalid or missing stride in block string: {}'.format(
                        block_string))

        return BlockArgs(
            num_repeat=int(options['r']),
            kernel_size=int(options['k']),
            stride=[int(options['s'][0])],
            expand_ratio=int(options['e']),
            input_filters=int(options['i']),
            output_filters=int(options['o']),
            se_ratio=float(options['se']) if 'se' in options else None,
            id_skip=('noskip' not in block_string))

    @staticmethod
    def _encode_block_string(block):
        """Encode a block to a string.

        Args:
            block (namedtuple): A BlockArgs type argument.

        Returns:
            block_string: A String form of BlockArgs.
        """
        # BlockArgs is built with a ``stride`` field by the decoder; the
        # legacy ``strides`` attribute is still accepted as a fallback.
        stride = getattr(block, 'stride', None)
        if stride is None:
            stride = block.strides
        stride = list(stride) if isinstance(stride,
                                            (list, tuple)) else [stride]
        if len(stride) == 1:
            # The decoder stores a single value used for both axes.
            stride = stride * 2

        args = [
            'r%d' % block.num_repeat,
            'k%d' % block.kernel_size,
            's%d%d' % (stride[0], stride[1]),
            'e%s' % block.expand_ratio,
            'i%d' % block.input_filters,
            'o%d' % block.output_filters
        ]
        # The decoder sets se_ratio to None when no 'se' option is present.
        if block.se_ratio is not None and 0 < block.se_ratio <= 1:
            args.append('se%s' % block.se_ratio)
        if block.id_skip is False:
            args.append('noskip')
        return '_'.join(args)

    @staticmethod
    def decode(string_list):
        """Decode a list of string notations to specify blocks in the network.

        Args:
            string_list (list[str]): A list of strings, each string is a
                notation of block.

        Returns:
            blocks_args: A list of BlockArgs namedtuples of block args.
        """
        assert isinstance(string_list, list)
        return [
            BlockDecoder._decode_block_string(block_string)
            for block_string in string_list
        ]

    @staticmethod
    def encode(blocks_args):
        """Encode a list of BlockArgs to a list of strings.

        Args:
            blocks_args (list[namedtuples]): A list of BlockArgs namedtuples
                of block args.

        Returns:
            block_strings: A list of strings, each string is a notation of
                block.
        """
        return [
            BlockDecoder._encode_block_string(block) for block in blocks_args
        ]
def efficientnet_params(model_name):
    """Look up the scaling coefficients of an EfficientNet variant.

    Args:
        model_name (str): Model name to be queried.

    Returns:
        tuple: The (width, depth, res, dropout) entry for ``model_name``.

    Raises:
        KeyError: If ``model_name`` is not one of the listed variants.
    """
    # (variant suffix, (width, depth, res, dropout))
    table = (
        ('b0', (1.0, 1.0, 112, 0.2)),
        ('b1', (1.0, 1.1, 240, 0.2)),
        ('b2', (1.1, 1.2, 260, 0.3)),
        ('b3', (1.2, 1.4, 300, 0.3)),
        ('b4', (1.4, 1.8, 380, 0.4)),
        ('b5', (1.6, 2.2, 456, 0.4)),
        ('b6', (1.8, 2.6, 528, 0.5)),
        ('b7', (2.0, 3.1, 600, 0.5)),
        ('b8', (2.2, 3.6, 672, 0.5)),
        ('l2', (4.3, 5.3, 800, 0.5)),
    )
    params_dict = {
        'efficientnet-' + suffix: coefficients
        for suffix, coefficients in table
    }
    return params_dict[model_name]
def efficientnet(width_coefficient=None,
                 depth_coefficient=None,
                 image_size=None,
                 dropout_rate=0.2,
                 drop_connect_rate=0.2,
                 num_classes=1000,
                 include_top=True):
    """Build the BlockArgs list and GlobalParams for an efficientnet model.

    Args:
        width_coefficient (float)
        depth_coefficient (float)
        image_size (int)
        dropout_rate (float)
        drop_connect_rate (float)
        num_classes (int)
        include_top (bool)

        Each value is stored under the same name in GlobalParams.

    Returns:
        blocks_args, global_params.
    """
    # One notation string per stage; decoded by BlockDecoder below.
    stage_strings = [
        'r1_k3_s11_e1_i32_o16_se0.25',
        'r2_k3_s22_e6_i16_o24_se0.25',
        'r2_k5_s22_e6_i24_o40_se0.25',
        'r3_k3_s22_e6_i40_o80_se0.25',
        'r3_k5_s11_e6_i80_o112_se0.25',
        'r4_k5_s22_e6_i112_o192_se0.25',
        'r1_k3_s11_e6_i192_o320_se0.25',
    ]
    decoded_blocks = BlockDecoder.decode(stage_strings)

    params = GlobalParams(
        width_coefficient=width_coefficient,
        depth_coefficient=depth_coefficient,
        image_size=image_size,
        dropout_rate=dropout_rate,
        num_classes=num_classes,
        batch_norm_momentum=0.99,
        batch_norm_epsilon=1e-3,
        drop_connect_rate=drop_connect_rate,
        depth_divisor=8,
        min_depth=None,
        include_top=include_top,
    )
    return decoded_blocks, params
def get_model_params(model_name, override_params):
    """Return the block args and global params for a given model name.

    Args:
        model_name (str): Model's name.
        override_params (dict): Fields of global_params to replace; skipped
            when empty or None.

    Returns:
        blocks_args, global_params

    Raises:
        NotImplementedError: If ``model_name`` does not start with
            'efficientnet'.
    """
    if not model_name.startswith('efficientnet'):
        raise NotImplementedError(
            'model name is not pre-defined: {}'.format(model_name))

    width, depth, resolution, dropout = efficientnet_params(model_name)
    blocks_args, global_params = efficientnet(
        width_coefficient=width,
        depth_coefficient=depth,
        dropout_rate=dropout,
        image_size=resolution)
    if override_params:
        global_params = global_params._replace(**override_params)
    return blocks_args, global_params
def load_pretrained_weights(model,
                            model_name,
                            weights_path=None,
                            load_fc=True,
                            advprop=False,
                            verbose=True):
    """Loads pretrained weights from weights path or download using url.

    Args:
        model (Module): The whole model of efficientnet.
        model_name (str): Model name of efficientnet.
        weights_path (None or str):
            str: path to pretrained weights file on the local disk.
            None: use pretrained weights downloaded from the Internet.
        load_fc (bool): Whether to load pretrained weights for fc layer at
            the end of the model.
        advprop (bool): Whether to load pretrained weights
            trained with advprop (valid when weights_path is None).
        verbose (bool): Whether to print a message once loading succeeded.

    Raises:
        AssertionError: If keys are missing from, or unexpected in, the
            loaded state dict.
    """
    if isinstance(weights_path, str):
        # Deserialize onto the CPU so a checkpoint saved from a GPU also
        # loads on CPU-only hosts; load_state_dict copies the values into
        # the model's own parameters afterwards.
        state_dict = torch.load(weights_path, map_location='cpu')
    else:
        url_map_ = url_map_advprop if advprop else url_map
        state_dict = model_zoo.load_url(url_map_[model_name])

    if load_fc:
        ret = model.load_state_dict(state_dict, strict=False)
        assert not ret.missing_keys, (
            'Missing keys when loading pretrained weights: {}'.format(
                ret.missing_keys))
    else:
        # Drop the classifier weights; tolerate checkpoints without them.
        state_dict.pop('_fc.weight', None)
        state_dict.pop('_fc.bias', None)
        ret = model.load_state_dict(state_dict, strict=False)
        assert set(ret.missing_keys) == {'_fc.weight', '_fc.bias'}, (
            'Missing keys when loading pretrained weights: {}'.format(
                ret.missing_keys))
    # This check is about unexpected keys, so the message says so.
    assert not ret.unexpected_keys, (
        'Unexpected keys when loading pretrained weights: {}'.format(
            ret.unexpected_keys))

    if verbose:
        print('Loaded pretrained weights for {}'.format(model_name))
| @@ -0,0 +1,67 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import torch | |||||
| from PIL import Image | |||||
| from torch import nn | |||||
| from torchvision import transforms | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .face_alignment.face_align import face_detection_PIL_v2 | |||||
| logger = get_logger() | |||||
def transform_PIL(img_pil):
    """Convert an image to a tensor and normalize its three channels.

    Args:
        img_pil: Image passed to ``transforms.ToTensor``.

    Returns:
        The output of the ToTensor + Normalize pipeline.
    """
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    pipeline = transforms.Compose([to_tensor, normalize])
    return pipeline(img_pil)
# AU numbers, looked up by position in the model's AU output.
index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26]

# Emotion names, looked up by position in the model's emotion output.
emotion_list = [
    'Neutral',
    'Anger',
    'Disgust',
    'Fear',
    'Happiness',
    'Sadness',
    'Surprise',
]
def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
    """Predict the emotion of the face found in an image file.

    Args:
        image_path: Path of the image to open.
        model: Callable returning ``(logits_AU, logits_emotion)`` for a
            batched face tensor.
        face_model: Passed through to ``face_detection_PIL_v2``.
        score_thre (float): Minimum softmax probability for the top
            emotion to be reported.
        GPU (int): CUDA device index used when CUDA is available.

    Returns:
        ``(emotion, bbox)`` when a face was found. When no face was found
        the return value is the dict ``{'emotion_result': None,
        'box': None}`` instead, so callers must handle both shapes.
    """
    # Close the file handle as soon as the pixels have been converted.
    with Image.open(image_path) as image_file:
        image = image_file.convert('RGB')

    face, bbox = face_detection_PIL_v2(image, face_model)
    if bbox is None:
        logger.warning('no face detected!')
        return {'emotion_result': None, 'box': None}

    face = transform_PIL(face).unsqueeze(0)
    if torch.cuda.is_available():
        face = face.cuda(GPU)

    # Pure inference: no autograd graph is needed. The AU logits are not
    # part of the returned result, so they are not post-processed here.
    with torch.no_grad():
        _, logits_emotion = model(face)
        probs = nn.functional.softmax(logits_emotion, 1)

    _, index_list = probs.max(1)
    emotion_index = index_list[0].item()
    prob = probs[0][emotion_index]
    # Index 3 is 'Fear' in emotion_list; it is reported as 'Neutral', as
    # are predictions below the score threshold.
    if prob > score_thre and emotion_index != 3:
        cur_emotion = emotion_list[emotion_index]
    else:
        cur_emotion = 'Neutral'

    return (cur_emotion, bbox)
| @@ -0,0 +1,96 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import os | |||||
| import sys | |||||
| import torch | |||||
| import torch.nn.functional as F | |||||
| from torch import nn | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.models.cv.face_emotion.efficient import EfficientNet | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| logger = get_logger() | |||||
@MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion)
class EfficientNetForFaceEmotion(TorchModel):
    """Registered wrapper that loads a FaceEmotionModel from ``model_dir``."""

    def __init__(self, model_dir, device_id=0, *args, **kwargs):
        """Build the network and load its checkpoint.

        Args:
            model_dir: Directory holding the checkpoint file named by
                ``ModelFile.TORCH_MODEL_BIN_FILE``.
            device_id: Forwarded to the base class.
        """
        super().__init__(
            model_dir=model_dir, device_id=device_id, *args, **kwargs)
        self.model = FaceEmotionModel(
            name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7)

        has_cuda = torch.cuda.is_available()
        self.device = 'cuda' if has_cuda else 'cpu'
        logger.info('Use GPU' if has_cuda else 'Use CPU')

        checkpoint_path = '{}/{}'.format(model_dir,
                                         ModelFile.TORCH_MODEL_BIN_FILE)
        checkpoint = torch.load(checkpoint_path, map_location=self.device)

        # Strip a leading 'module.' from parameter names.
        new_state = {}
        for name, tensor in checkpoint['model'].items():
            key = name[7:] if name.startswith('module.') else name
            new_state[key] = tensor

        self.model.load_state_dict(new_state)
        self.model.eval()
        self.model.to(self.device)

    def forward(self, x):
        """Return ``(logits_au, logits_emotion)`` from the wrapped model."""
        logits_au, logits_emotion = self.model(x)
        return logits_au, logits_emotion
class FaceEmotionModel(nn.Module):
    """EfficientNet backbone with an AU head and an emotion head."""

    def __init__(self,
                 name='efficientnet-b0',
                 num_embed=512,
                 num_au=12,
                 num_emotion=7):
        """Create the backbone, the embedding layers and both heads.

        Args:
            name (str): Backbone name given to ``EfficientNet.from_pretrained``.
            num_embed (int): Size of the embedding vector.
            num_au (int): Number of AU outputs.
            num_emotion (int): Number of emotion outputs.
        """
        super(FaceEmotionModel, self).__init__()
        self.backbone = EfficientNet.from_pretrained(
            name, weights_path=None, advprop=True)
        self.average_pool = nn.AdaptiveAvgPool2d(1)
        # The embedding takes as many inputs as the backbone's own fc layer.
        backbone_dim = self.backbone._fc.weight.data.shape[1]
        self.embed = nn.Linear(backbone_dim, num_embed)
        self.features = nn.BatchNorm1d(num_embed)
        # The batch-norm scale is fixed at 1.0 and excluded from training.
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False
        self.fc_au = nn.Sequential(
            nn.Dropout(0.6),
            nn.Linear(num_embed, num_au),
        )
        self.fc_emotion = nn.Sequential(
            nn.Dropout(0.6),
            nn.Linear(num_embed, num_emotion),
        )

    def feat_single_img(self, x):
        """Return the normalized embedding of ``x``."""
        feats = self.backbone.extract_features(x)
        pooled = self.average_pool(feats).flatten(1)
        return self.features(self.embed(pooled))

    def forward(self, x):
        """Return ``(logits_au, logits_emotion)`` for the input batch."""
        embedding = self.feat_single_img(x)
        logits_au = self.fc_au(embedding)
        # Sigmoid of the AU logits weights copies of the embedding.
        au_weights = torch.sigmoid(logits_au).unsqueeze(-1)
        weighted = torch.matmul(au_weights, embedding.unsqueeze(1))
        # Sum the weighted copies over dim 1 before the emotion head.
        logits_emotion = self.fc_emotion(weighted.sum(1))
        return logits_au, logits_emotion
| @@ -0,0 +1,79 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import os | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import tensorflow as tf | |||||
def init(mod):
    """Load a serialized TensorFlow graph and open a session on it.

    Args:
        mod: Path of the serialized GraphDef file.

    Returns:
        tuple: ``(sess, net)`` - the session and the graph it runs.
    """
    # GraphDef / ConfigProto / Session / gfile are TF1-style symbols. TF2
    # only exposes them under tf.compat.v1, so resolve them there when it
    # exists and fall back to the top-level module otherwise.
    tf1 = getattr(getattr(tf, 'compat', None), 'v1', tf)

    net = tf1.Graph()
    with net.as_default():
        od_graph_def = tf1.GraphDef()
        config = tf1.ConfigProto()
        # Cap this process at 60% of the GPU memory.
        config.gpu_options.per_process_gpu_memory_fraction = 0.6
        with tf1.gfile.GFile(mod, 'rb') as fid:
            od_graph_def.ParseFromString(fid.read())
        tf1.import_graph_def(od_graph_def, name='')
        sess = tf1.Session(graph=net, config=config)
    return sess, net
def filter_bboxes_confs(shape,
                        imgsBboxes,
                        imgsConfs,
                        single=False,
                        thresh=0.5):
    """Keep boxes whose confidence reaches ``thresh`` and scale them.

    Each box is read as four values ``(a1, b1, a2, b2)``. The ``a`` values
    are multiplied by ``shape[0]`` and the ``b`` values by ``shape[1]``,
    truncated to int, and the kept box is stored as ``[b1, a1, b2, a2]``.

    Args:
        shape: Two scale factors.
        imgsBboxes: Boxes of one image (``single=True``) or a list of
            per-image box lists.
        imgsConfs: Confidences laid out like ``imgsBboxes``.
        single (bool): Whether the inputs describe a single image.
        thresh (float): Minimum confidence (inclusive) to keep a box.

    Returns:
        tuple: ``(bboxes, confs)``, nested per image when ``single`` is
        False.
    """
    scale_a, scale_b = shape

    def _filter_one(boxes, scores):
        # Filter and scale the boxes of a single image.
        kept_boxes, kept_scores = [], []
        for idx in range(len(boxes)):
            score = scores[idx]
            if score < thresh:
                continue
            a1, b1, a2, b2 = list(boxes[idx])
            a1, a2 = int(scale_a * a1), int(scale_a * a2)
            b1, b2 = int(scale_b * b1), int(scale_b * b2)
            kept_boxes.append([b1, a1, b2, a2])
            kept_scores.append(score)
        return kept_boxes, kept_scores

    if single:
        return _filter_one(imgsBboxes, imgsConfs)

    all_boxes, all_scores = [], []
    for img_idx in range(len(imgsBboxes)):
        boxes, scores = _filter_one(imgsBboxes[img_idx], imgsConfs[img_idx])
        all_boxes.append(boxes)
        all_scores.append(scores)
    return all_boxes, all_scores
def detect(im, sess, net):
    """Run the detection graph on one image and return thresholded boxes.

    Args:
        im: Image array with three dimensions.
        sess: Session used to run the graph.
        net: Graph providing the input and detection tensors.

    Returns:
        tuple: ``(bboxes, confs)`` as produced by ``filter_bboxes_confs``.
    """
    converted = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    batch = np.expand_dims(converted, axis=0)

    image_tensor = net.get_tensor_by_name('image_tensor:0')
    fetches = [
        net.get_tensor_by_name(tensor_name)
        for tensor_name in ('detection_boxes:0', 'detection_scores:0',
                            'detection_classes:0', 'num_detections:0')
    ]
    # Classes and detection count are fetched but not used afterwards.
    boxes, scores, _classes, _num = sess.run(
        fetches, feed_dict={image_tensor: batch})

    dim0, dim1, _ = im.shape
    # Only the first (and only) image of the batch is filtered.
    return filter_bboxes_confs([dim0, dim1], boxes[0], scores[0], True)
class FaceDetector:
    """Holds a session and graph created by ``init`` and runs ``detect``."""

    def __init__(self, mod):
        """Create the session and graph from the model path ``mod``."""
        session, graph = init(mod)
        self.sess = session
        self.net = graph

    def do_detect(self, im):
        """Return ``(bboxes, confs)`` detected in image ``im``."""
        return detect(im, self.sess, self.net)
| @@ -0,0 +1,59 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import os | |||||
| import sys | |||||
| import cv2 | |||||
| import numpy as np | |||||
| from PIL import Image, ImageFile | |||||
| from .face import FaceDetector | |||||
| ImageFile.LOAD_TRUNCATED_IMAGES = True | |||||
def adjust_bx_v2(box, w, h):
    """Grow the shorter side of a box toward a square inside the image.

    Args:
        box: Sequence whose first four items are ``x1, y1, x2, y2``.
        w: Image width; x coordinates are limited to ``[0, w - 1]``.
        h: Image height; y coordinates are limited to ``[0, h - 1]``.

    Returns:
        list: ``[x1, y1, x2, y2]`` truncated to ints.
    """
    x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
    box_w = x2 - x1
    box_h = y2 - y1
    delta = abs(box_w - box_h)

    if box_w > box_h:
        # Wider than tall: extend upwards first, then downwards with
        # whatever does not fit above.
        if y1 >= delta:
            y1 = y1 - delta
        else:
            delta_y1 = y1
            y1 = 0
            delta_y2 = delta - delta_y1
            y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1
    else:
        half = delta / 2
        if x1 >= half and x2 <= w - half:
            # Room on both sides: grow symmetrically.
            x1 = x1 - half
            x2 = x2 + half
        elif x1 < half and x2 <= w - half:
            # Not enough room on the left: use it all, rest goes right.
            delta_x1 = x1
            x1 = 0
            delta_x2 = delta - delta_x1
            x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1
        elif x1 >= half and x2 > w - half:
            # Not enough room on the right: use it all, rest goes left.
            # The remainder is delta minus the slack used on the right
            # (delta - delta_x2); subtracting the coordinate x1 here could
            # move x1 to the right, even past x2.
            delta_x2 = w - x2
            x2 = w - 1
            delta_x1 = delta - delta_x2
            x1 = x1 - delta_x1 if x1 >= delta_x1 else 0
        # With too little room on both sides the box is left unchanged.

    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
    return [x1, y1, x2, y2]
def face_detection_PIL_v2(image, face_model):
    """Detect a face, crop it with a squared box and resize it to 112x112.

    Args:
        image: Image convertible with ``np.array``.
        face_model: Model path handed to ``FaceDetector``.

    Returns:
        tuple: ``(face, box)`` where ``face`` is the resized PIL crop and
        ``box`` the ``(x1, y1, x2, y2)`` tuple used for cropping, or
        ``(None, None)`` when the detector returns no box.
    """
    crop_size = 112
    # NOTE(review): a detector is created on every call; confirm whether
    # callers should reuse one instead.
    face_detector = FaceDetector(face_model)
    img = np.array(image)
    h, w = img.shape[0:2]
    bxs, conf = face_detector.do_detect(img)
    if len(bxs) == 0:
        # Nothing passed the detector's threshold; report "no face"
        # instead of failing with an IndexError on bxs[0].
        return None, None

    # Only the first returned box is used.
    bx = adjust_bx_v2(bxs[0], w, h)
    x1, y1, x2, y2 = bx
    cropped = Image.fromarray(img[y1:y2, x1:x2, :])
    cropped = cropped.resize((crop_size, crop_size))
    return cropped, tuple(bx)
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License | |||||
| # at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py | |||||
| import contextlib | import contextlib | ||||
| import warnings | import warnings | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License | |||||
| # at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py | |||||
| import os | import os | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License | |||||
| # at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py | |||||
| import os | import os | ||||
| from collections import abc | from collections import abc | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from stylegan2-pytorch, | |||||
| # made publicly available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py | |||||
| import functools | import functools | ||||
| import math | import math | ||||
| import operator | import operator | ||||
| @@ -0,0 +1,20 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
if TYPE_CHECKING:
    # Static type checkers see the real import.
    from .det_infer import NanoDetForFaceHumanHandDetection
else:
    # At runtime this module is replaced by a LazyImportModule built from
    # the mapping of submodule name -> exported names.
    _import_structure = {
        'det_infer': ['NanoDetForFaceHumanHandDetection'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
| @@ -0,0 +1,133 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .one_stage_detector import OneStageDetector | |||||
| logger = get_logger() | |||||
def load_model_weight(model_dir, device):
    """Load the checkpoint in ``model_dir`` and return its state dict.

    Keys starting with 'avg_model.' are re-inserted with their first four
    characters removed (so 'avg_model.x' becomes 'model.x').

    Args:
        model_dir: Directory holding the checkpoint file named by
            ``ModelFile.TORCH_MODEL_BIN_FILE``.
        device: ``map_location`` passed to ``torch.load``.

    Returns:
        A copy of ``checkpoint['state_dict']`` with the renamed keys.
    """
    checkpoint_path = '{}/{}'.format(model_dir,
                                     ModelFile.TORCH_MODEL_BIN_FILE)
    checkpoint = torch.load(checkpoint_path, map_location=device)

    original = checkpoint['state_dict']
    # Iterate the original while mutating the copy.
    state_dict = original.copy()
    for key in original:
        if not key.startswith('avg_model.'):
            continue
        state_dict[key[4:]] = state_dict.pop(key)
    return state_dict
@MODELS.register_module(
    Tasks.face_human_hand_detection,
    module_name=Models.face_human_hand_detection)
class NanoDetForFaceHumanHandDetection(TorchModel):
    """Registered wrapper that loads a OneStageDetector from ``model_dir``."""

    def __init__(self, model_dir, device_id=0, *args, **kwargs):
        """Build the detector and load its checkpoint.

        Args:
            model_dir: Directory read by ``load_model_weight``.
            device_id: Forwarded to the base class.
        """
        super().__init__(
            model_dir=model_dir, device_id=device_id, *args, **kwargs)
        self.model = OneStageDetector()

        has_cuda = torch.cuda.is_available()
        self.device = 'cuda' if has_cuda else 'cpu'
        logger.info('Use GPU ' if has_cuda else 'Use CPU')

        # NOTE(review): if TorchModel is an nn.Module, this attribute
        # shadows its state_dict() method - confirm this is intended.
        self.state_dict = load_model_weight(model_dir, self.device)
        # strict=False: keys that do not match the detector are ignored.
        self.model.load_state_dict(self.state_dict, strict=False)
        self.model.eval()
        self.model.to(self.device)

    def forward(self, x):
        """Return the detector's ``inference`` result for ``x``."""
        return self.model.inference(x)
def naive_collate(batch):
    """Turn a list of dicts into a dict of lists, recursively.

    Args:
        batch (list): Non-empty list of samples.

    Returns:
        ``batch`` itself when its first element is not a dict. Otherwise a
        dict with the keys of the first element, each mapped to the
        collated list of that key's values across the batch.
    """
    first = batch[0]
    if not isinstance(first, dict):
        return batch
    collated = {}
    for key in first:
        collated[key] = naive_collate([sample[key] for sample in batch])
    return collated
def get_resize_matrix(raw_shape, dst_shape):
    """Build the 3x3 scaling matrix that maps ``raw_shape`` to ``dst_shape``.

    Args:
        raw_shape: ``(width, height)`` of the source.
        dst_shape: ``(width, height)`` of the target.

    Returns:
        np.ndarray: Diagonal matrix ``diag(d_w / r_w, d_h / r_h, 1)``.
    """
    raw_w, raw_h = raw_shape
    dst_w, dst_h = dst_shape
    return np.diag([dst_w / raw_w, dst_h / raw_h, 1.0])
def color_aug_and_norm(meta, mean, std):
    """Normalize ``meta['img']`` in place with per-channel mean and std.

    The image, ``mean`` and ``std`` are all divided by 255 before the
    ``(img - mean) / std`` step.

    Args:
        meta (dict): Must contain an array under ``'img'``.
        mean: Three per-channel means.
        std: Three per-channel standard deviations.

    Returns:
        dict: The same ``meta`` object with ``'img'`` replaced.
    """
    scale = 255
    pixels = meta['img'].astype(np.float32) / scale
    # Shape (1, 1, 3) so the statistics broadcast over the first two axes.
    mean_arr = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / scale
    std_arr = np.array(std, dtype=np.float32).reshape(1, 1, 3) / scale
    meta['img'] = (pixels - mean_arr) / std_arr
    return meta
def img_process(meta, mean, std):
    """Resize ``meta['img']`` to 320x320 and normalize it.

    Args:
        meta (dict): Must contain the raw image under ``'img'``.
        mean: Per-channel means forwarded to ``color_aug_and_norm``.
        std: Per-channel stds forwarded to ``color_aug_and_norm``.

    Returns:
        dict: ``meta`` with ``'img'`` replaced by the processed image and
        ``'warp_matrix'`` set to the 3x3 matrix used for warping.
    """
    source = meta['img']
    src_h, src_w = source.shape[0], source.shape[1]
    dst_shape = [320, 320]

    # Start from the identity so further transforms could be chained in.
    identity = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
    warp = get_resize_matrix((src_w, src_h), dst_shape) @ identity

    meta['img'] = cv2.warpPerspective(source, warp, dsize=tuple(dst_shape))
    meta['warp_matrix'] = warp
    return color_aug_and_norm(meta, mean, std)
def overlay_bbox_cv(dets, class_names, score_thresh):
    """Flatten per-label detections into one score-sorted list.

    Args:
        dets (dict): Maps a label to boxes whose first four items are
            coordinates and whose last item is the score.
        class_names: Not used by this function.
        score_thresh (float): Boxes must score strictly above this value.

    Returns:
        list: ``[label, x0, y0, x1, y1, score]`` entries with int
        coordinates, sorted by ascending score.
    """
    kept = []
    for label, boxes in dets.items():
        for box in boxes:
            score = box[-1]
            if not score > score_thresh:
                continue
            x0, y0, x1, y1 = [int(coord) for coord in box[:4]]
            kept.append([label, x0, y0, x1, y1, score])
    return sorted(kept, key=lambda entry: entry[5])
# Per-channel normalization statistics on the 0-255 scale
# (color_aug_and_norm divides them by 255 before use).
# NOTE(review): presumably in BGR order, since images come from
# cv2.imread - confirm.
mean = [103.53, 116.28, 123.675]
std = [57.375, 57.12, 58.395]

# Class names handed to overlay_bbox_cv by inference.
class_names = ['person', 'face', 'hand']
def inference(model, device, img_path):
    """Detect persons, faces and hands in an image file.

    Args:
        model: Callable taking the collated ``meta`` dict.
        device: Device the image tensor is moved to.
        img_path: Path of the image to read with ``cv2.imread``.

    Returns:
        list: ``[label, x0, y0, x1, y1, score]`` entries with score above
        0.35, as produced by ``overlay_bbox_cv``.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None; fail
        # with a clear message instead of an AttributeError on img.shape.
        raise ValueError('failed to read image: {}'.format(img_path))

    height, width = img.shape[:2]
    img_info = {'id': 0, 'height': height, 'width': width}
    meta = dict(img_info=img_info, raw_img=img, img=img)
    meta = img_process(meta, mean, std)
    # Move the channel axis first, then onto the target device.
    meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device)
    meta = naive_collate([meta])
    # naive_collate wraps every value in a list; unwrap the image and add
    # the batch dimension (img_process always resizes to 320x320).
    meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320)
    with torch.no_grad():
        res = model(meta)
    return overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
| @@ -0,0 +1,395 @@ | |||||
| # The implementation here is modified based on nanodet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||||
| import math | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from .utils import ConvModule, DepthwiseConvModule, act_layers | |||||
| def _make_divisible(v, divisor, min_value=None): | |||||
| if min_value is None: | |||||
| min_value = divisor | |||||
| new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | |||||
| # Make sure that round down does not go down by more than 10%. | |||||
| if new_v < 0.9 * v: | |||||
| new_v += divisor | |||||
| return new_v | |||||
def hard_sigmoid(x, inplace: bool = False):
    """Compute the hard sigmoid ``relu6(x + 3) / 6``.

    Args:
        x (Tensor): Input tensor.
        inplace (bool): When True, ``x`` itself is modified and returned.

    Returns:
        Tensor: Values clamped to ``[0, 1]``.
    """
    if inplace:
        return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
    # This file imports ``torch.nn as nn`` but no ``F`` alias, so go
    # through ``nn.functional`` to avoid a NameError on this path.
    return nn.functional.relu6(x + 3.0) / 6.0
class SqueezeExcite(nn.Module):
    """Channel gating: pool, reduce, activate, expand, then scale the input."""

    def __init__(self,
                 in_chs,
                 se_ratio=0.25,
                 reduced_base_chs=None,
                 activation='ReLU',
                 gate_fn=hard_sigmoid,
                 divisor=4,
                 **_):
        """Create the pooling and the two 1x1 convolutions.

        Args:
            in_chs (int): Number of input (and output) channels.
            se_ratio (float): Fraction of channels kept in the bottleneck.
            reduced_base_chs (int or None): Channel count the ratio is
                applied to; ``in_chs`` is used when falsy.
            activation (str): Name passed to ``act_layers``.
            gate_fn: Function applied to the expanded tensor to get the
                gate.
            divisor (int): The bottleneck width is made divisible by this.
            **_: Extra keyword arguments are accepted and ignored.
        """
        super(SqueezeExcite, self).__init__()
        self.gate_fn = gate_fn
        base_chs = reduced_base_chs or in_chs
        reduced_chs = _make_divisible(base_chs * se_ratio, divisor)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
        self.act1 = act_layers(activation)
        self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)

    def forward(self, x):
        """Return ``x`` multiplied by its gate."""
        squeezed = self.avg_pool(x)
        reduced = self.act1(self.conv_reduce(squeezed))
        gate = self.gate_fn(self.conv_expand(reduced))
        return x * gate
class GhostModule(nn.Module):
    """Primary convolution plus a grouped 'cheap' convolution, concatenated."""

    def __init__(self,
                 inp,
                 oup,
                 kernel_size=1,
                 ratio=2,
                 dw_size=3,
                 stride=1,
                 activation='ReLU'):
        """Create the primary and the cheap branch.

        Args:
            inp (int): Input channels.
            oup (int): Requested output channels (stored as ``self.oup``).
            kernel_size (int): Kernel size of the primary convolution.
            ratio (int): The primary branch gets ``ceil(oup / ratio)``
                channels, the cheap branch ``ratio - 1`` times as many.
            dw_size (int): Kernel size of the cheap convolution.
            stride (int): Stride of the primary convolution.
            activation (str or None): Name passed to ``act_layers``; a
                falsy value uses an empty ``nn.Sequential`` instead.
        """
        super(GhostModule, self).__init__()
        self.oup = oup
        primary_chs = math.ceil(oup / ratio)
        cheap_chs = primary_chs * (ratio - 1)

        primary_layers = [
            nn.Conv2d(
                inp,
                primary_chs,
                kernel_size,
                stride,
                kernel_size // 2,
                bias=False),
            nn.BatchNorm2d(primary_chs),
            act_layers(activation) if activation else nn.Sequential(),
        ]
        self.primary_conv = nn.Sequential(*primary_layers)

        cheap_layers = [
            # One group per primary channel.
            nn.Conv2d(
                primary_chs,
                cheap_chs,
                dw_size,
                1,
                dw_size // 2,
                groups=primary_chs,
                bias=False,
            ),
            nn.BatchNorm2d(cheap_chs),
            act_layers(activation) if activation else nn.Sequential(),
        ]
        self.cheap_operation = nn.Sequential(*cheap_layers)

    def forward(self, x):
        """Return both branches concatenated along the channel axis."""
        primary = self.primary_conv(x)
        cheap = self.cheap_operation(primary)
        return torch.cat([primary, cheap], dim=1)
class GhostBottleneck(nn.Module):
    """Ghost bottleneck with optional strided depthwise conv and optional SE.

    Args:
        in_chs (int): Number of input channels.
        mid_chs (int): Number of channels after the first ghost module.
        out_chs (int): Number of output channels.
        dw_kernel_size (int): Kernel size of the depthwise convolutions.
        stride (int): Stride; the depthwise conv is only built when > 1.
        activation (str): Activation name used by the first ghost module.
        se_ratio (float | None): Squeeze-excite ratio; SE is only built when
            this is a positive number.
    """

    def __init__(
        self,
        in_chs,
        mid_chs,
        out_chs,
        dw_kernel_size=3,
        stride=1,
        activation='ReLU',
        se_ratio=0.0,
    ):
        super().__init__()
        use_se = se_ratio is not None and se_ratio > 0.0
        dw_padding = (dw_kernel_size - 1) // 2
        self.stride = stride

        # Point-wise expansion
        self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation)

        # Depth-wise convolution (only needed when down-sampling)
        if self.stride > 1:
            self.conv_dw = nn.Conv2d(
                mid_chs,
                mid_chs,
                dw_kernel_size,
                stride=stride,
                padding=dw_padding,
                groups=mid_chs,
                bias=False,
            )
            self.bn_dw = nn.BatchNorm2d(mid_chs)

        # Optional squeeze-and-excitation
        self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) if use_se else None

        # Point-wise linear projection (no activation)
        self.ghost2 = GhostModule(mid_chs, out_chs, activation=None)

        # Shortcut: identity when shapes already match, otherwise a
        # depthwise + point-wise projection of the input.
        shortcut_layers = []
        if not (in_chs == out_chs and self.stride == 1):
            shortcut_layers = [
                nn.Conv2d(
                    in_chs,
                    in_chs,
                    dw_kernel_size,
                    stride=stride,
                    padding=dw_padding,
                    groups=in_chs,
                    bias=False,
                ),
                nn.BatchNorm2d(in_chs),
                nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_chs),
            ]
        self.shortcut = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        """Run the bottleneck and add the shortcut branch to its output."""
        identity = x
        out = self.ghost1(x)
        if self.stride > 1:
            out = self.bn_dw(self.conv_dw(out))
        if self.se is not None:
            out = self.se(out)
        out = self.ghost2(out)
        out += self.shortcut(identity)
        return out
class GhostBlocks(nn.Module):
    """Stack of GhostBottleneck used in GhostPAN.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expand (int): Expand ratio of GhostBottleneck. Default: 1.
        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
        use_res (bool): Whether to use residual connection. Default: False.
        activation (str): Name of activation function. Default: LeakyReLU.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        expand=1,
        kernel_size=5,
        num_blocks=1,
        use_res=False,
        activation='LeakyReLU',
    ):
        super(GhostBlocks, self).__init__()
        self.use_res = use_res
        if use_res:
            # 1x1 projection so the residual matches the output channels.
            self.reduce_conv = ConvModule(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                activation=activation,
            )
        blocks = []
        block_in_channels = in_channels
        for _ in range(num_blocks):
            blocks.append(
                GhostBottleneck(
                    block_in_channels,
                    int(out_channels * expand),
                    out_channels,
                    dw_kernel_size=kernel_size,
                    activation=activation,
                ))
            # Only the first block sees the raw input; every later block
            # consumes the previous block's ``out_channels`` output.
            block_in_channels = out_channels
        self.blocks = nn.Sequential(*blocks)

    def forward(self, x):
        """Apply the stacked bottlenecks, plus the residual if enabled."""
        out = self.blocks(x)
        if self.use_res:
            out = out + self.reduce_conv(x)
        return out
class GhostPAN(nn.Module):
    """Path Aggregation Network with Ghost block.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale)
        use_depthwise (bool): Whether to use depthwise separable convolution
            for the down-sampling and extra-level convs. Default: False
        kernel_size (int): Kernel size of depthwise convolution. Default: 5.
        expand (int): Expand ratio of GhostBottleneck. Default: 1.
        num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
        use_res (bool): Whether to use residual connection. Default: False.
        num_extra_level (int): Number of extra conv layers for more feature
            levels. Default: 0.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(scale_factor=2, mode='bilinear')`
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN')
        activation (str): Activation layer name.
            Default: LeakyReLU.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        use_depthwise=False,
        kernel_size=5,
        expand=1,
        num_blocks=1,
        use_res=False,
        num_extra_level=0,
        upsample_cfg=dict(scale_factor=2, mode='bilinear'),
        norm_cfg=dict(type='BN'),
        activation='LeakyReLU',
    ):
        super(GhostPAN, self).__init__()
        assert num_extra_level >= 0
        assert num_blocks >= 1
        self.in_channels = in_channels
        self.out_channels = out_channels

        conv = DepthwiseConvModule if use_depthwise else ConvModule

        # build top-down blocks
        self.upsample = nn.Upsample(**upsample_cfg)
        # 1x1 convs bringing every input scale to ``out_channels``.
        self.reduce_layers = nn.ModuleList()
        for level_channels in in_channels:
            self.reduce_layers.append(
                ConvModule(
                    level_channels,
                    out_channels,
                    1,
                    norm_cfg=norm_cfg,
                    activation=activation,
                ))
        self.top_down_blocks = nn.ModuleList()
        for _ in range(len(in_channels) - 1, 0, -1):
            self.top_down_blocks.append(
                GhostBlocks(
                    out_channels * 2,
                    out_channels,
                    expand,
                    kernel_size=kernel_size,
                    num_blocks=num_blocks,
                    use_res=use_res,
                    activation=activation,
                ))

        # build bottom-up blocks
        self.downsamples = nn.ModuleList()
        self.bottom_up_blocks = nn.ModuleList()
        for _ in range(len(in_channels) - 1):
            self.downsamples.append(
                conv(
                    out_channels,
                    out_channels,
                    kernel_size,
                    stride=2,
                    padding=kernel_size // 2,
                    norm_cfg=norm_cfg,
                    activation=activation,
                ))
            self.bottom_up_blocks.append(
                GhostBlocks(
                    out_channels * 2,
                    out_channels,
                    expand,
                    kernel_size=kernel_size,
                    num_blocks=num_blocks,
                    use_res=use_res,
                    activation=activation,
                ))

        # extra layers
        self.extra_lvl_in_conv = nn.ModuleList()
        self.extra_lvl_out_conv = nn.ModuleList()
        for _ in range(num_extra_level):
            self.extra_lvl_in_conv.append(
                conv(
                    out_channels,
                    out_channels,
                    kernel_size,
                    stride=2,
                    padding=kernel_size // 2,
                    norm_cfg=norm_cfg,
                    activation=activation,
                ))
            self.extra_lvl_out_conv.append(
                conv(
                    out_channels,
                    out_channels,
                    kernel_size,
                    stride=2,
                    padding=kernel_size // 2,
                    norm_cfg=norm_cfg,
                    activation=activation,
                ))

    def forward(self, inputs):
        """
        Args:
            inputs (tuple[Tensor]): input features.
        Returns:
            tuple[Tensor]: multi level features.
        """
        assert len(inputs) == len(self.in_channels)
        inputs = [
            reduce(input_x)
            for input_x, reduce in zip(inputs, self.reduce_layers)
        ]

        # top-down path: upsample the current top feature and fuse it with
        # the next (earlier) input level.
        inner_outs = [inputs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            upsample_feat = self.upsample(feat_high)
            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
                torch.cat([upsample_feat, feat_low], 1))
            inner_outs.insert(0, inner_out)

        # bottom-up path: downsample the latest output and fuse it with the
        # matching top-down feature.
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](
                torch.cat([downsample_feat, feat_high], 1))
            outs.append(out)

        # extra layers: the in-branch is chained so that, for more than one
        # extra level, both summands are down-sampled the same number of
        # times (identical to the previous behaviour for a single level).
        extra_in = inputs[-1]
        for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv,
                                                   self.extra_lvl_out_conv):
            extra_in = extra_in_layer(extra_in)
            outs.append(extra_in + extra_out_layer(outs[-1]))

        return tuple(outs)
| @@ -0,0 +1,427 @@ | |||||
| # The implementation here is modified based on nanodet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||||
| import math | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torchvision.ops import nms | |||||
| from .utils import ConvModule, DepthwiseConvModule | |||||
class Integral(nn.Module):
    """Fixed layer turning a discrete distribution into its expected value.

    The result is :math:`sum{P(y_i) * y_i}` where P(y_i) is the softmax over
    the bins and y_i runs over the discrete set {0, 1, ..., reg_max}.

    Args:
        reg_max (int): The maximal value of the discrete set. Default: 16.
    """

    def __init__(self, reg_max=16):
        super().__init__()
        self.reg_max = reg_max
        bin_values = torch.linspace(0, self.reg_max, self.reg_max + 1)
        self.register_buffer('project', bin_values)

    def forward(self, x):
        """Compute the integral (expected bin value) for each of 4 sides.

        Args:
            x (Tensor): Regression features whose last dimension has size
                ``4 * (reg_max + 1)``.

        Returns:
            Tensor: Same leading dimensions as ``x`` with a last dimension
                of size 4.
        """
        lead_dims = x.size()[:-1]
        num_bins = self.reg_max + 1
        probs = F.softmax(x.reshape(*lead_dims, 4, num_bins), dim=-1)
        expected = F.linear(probs, self.project.type_as(probs))
        return expected.reshape(*lead_dims, 4)
def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
    """Performs non-maximum suppression in a batched fashion.

    Modified from https://github.com/pytorch/vision/blob
    /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
    In order to perform NMS independently per class, we add an offset to all
    the boxes. The offset is dependent only on the class idx, and is large
    enough so that boxes from different classes do not overlap.

    Arguments:
        boxes (torch.Tensor): boxes in shape (N, 4).
        scores (torch.Tensor): scores in shape (N, ).
        idxs (torch.Tensor): each index value correspond to a bbox cluster,
            and NMS will not be applied between elements of different idxs,
            shape (N, ).
        nms_cfg (dict): specify nms type and other parameters like iou_thr.
            The dict is copied, never modified. ``type``, ``split_thr``
            (default 10000) and ``class_agnostic`` are consumed here; every
            remaining key is forwarded to ``nms`` as a keyword argument.
        class_agnostic (bool): if true, nms is class agnostic,
            i.e. IoU thresholding happens over all boxes,
            regardless of the predicted class.

    Returns:
        tuple: kept dets of shape (k, 5) and the kept indices of shape (k, ).
    """
    nms_cfg_ = nms_cfg.copy()
    class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
    nms_cfg_.pop('type', 'nms')
    split_thr = nms_cfg_.pop('split_thr', 10000)

    if boxes.numel() == 0:
        # Nothing to suppress; ``boxes.max()`` below would raise on an
        # empty tensor.
        return (boxes.new_zeros((0, 5)),
                boxes.new_zeros((0, ), dtype=torch.long))

    if class_agnostic:
        boxes_for_nms = boxes
    else:
        # Shift every class into its own disjoint coordinate range.
        max_coordinate = boxes.max()
        offsets = idxs.to(boxes) * (max_coordinate + 1)
        boxes_for_nms = boxes + offsets[:, None]

    if len(boxes_for_nms) < split_thr:
        keep = nms(boxes_for_nms, scores, **nms_cfg_)
        boxes = boxes[keep]
        scores = scores[keep]
    else:
        # Too many boxes: run NMS cluster by cluster and merge the results.
        total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
        for cluster_id in torch.unique(idxs):
            mask = (idxs == cluster_id).nonzero(as_tuple=False).view(-1)
            keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_)
            total_mask[mask[keep]] = True

        keep = total_mask.nonzero(as_tuple=False).view(-1)
        keep = keep[scores[keep].argsort(descending=True)]
        boxes = boxes[keep]
        scores = scores[keep]

    return torch.cat([boxes, scores[:, None]], -1), keep
def multiclass_nms(multi_bboxes,
                   multi_scores,
                   score_thr,
                   nms_cfg,
                   max_num=-1,
                   score_factors=None):
    """NMS for multi-class bboxes.

    Args:
        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
        multi_scores (Tensor): shape (n, #class + 1), where the last column
            is dropped before thresholding (background class).
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        nms_cfg (dict): config forwarded to ``batched_nms``.
        max_num (int): if positive, only the first ``max_num`` detections
            returned by ``batched_nms`` are kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS (applied after the threshold mask is computed).

    Returns:
        tuple: (bboxes, labels); bboxes has shape (k, 5) and labels has
            shape (k, ). Labels are 0-based.
    """
    num_imgs_boxes = multi_scores.size(0)
    num_classes = multi_scores.size(1) - 1

    # Bring the boxes to shape (n, #class, 4).
    if multi_bboxes.shape[1] > 4:
        per_class_boxes = multi_bboxes.view(num_imgs_boxes, -1, 4)
    else:
        per_class_boxes = multi_bboxes[:, None].expand(
            num_imgs_boxes, num_classes, 4)

    class_scores = multi_scores[:, :-1]
    keep_mask = class_scores > score_thr
    selected_boxes = per_class_boxes[keep_mask].view(-1, 4)
    if score_factors is not None:
        class_scores = class_scores * score_factors[:, None]
    selected_scores = class_scores[keep_mask]
    # Column index of every surviving (box, class) pair is its label.
    selected_labels = keep_mask.nonzero(as_tuple=False)[:, 1]

    if selected_boxes.numel() == 0:
        empty_dets = multi_bboxes.new_zeros((0, 5))
        empty_labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
        if torch.onnx.is_in_onnx_export():
            raise RuntimeError('[ONNX Error] Can not record NMS '
                               'as it has not been executed this time')
        return empty_dets, empty_labels

    dets, keep = batched_nms(selected_boxes, selected_scores, selected_labels,
                             nms_cfg)
    if max_num > 0:
        dets, keep = dets[:max_num], keep[:max_num]
    return dets, selected_labels[keep]
def distance2bbox(points, distance, max_shape=None):
    """Decode distance prediction to bounding box.

    Args:
        points (Tensor): Points whose last dimension holds [x, y].
        distance (Tensor): Distances from each point to the 4 boundaries
            (left, top, right, bottom) in the last dimension.
        max_shape (tuple): Optional (height, width); when given, x values
            are clamped to [0, width] and y values to [0, height].

    Returns:
        Tensor: Decoded bboxes as [x1, y1, x2, y2] in the last dimension.
    """
    px = points[..., 0]
    py = points[..., 1]
    corners = [
        px - distance[..., 0],
        py - distance[..., 1],
        px + distance[..., 2],
        py + distance[..., 3],
    ]
    if max_shape is not None:
        upper_bounds = (max_shape[1], max_shape[0], max_shape[1],
                        max_shape[0])
        corners = [
            coord.clamp(min=0, max=bound)
            for coord, bound in zip(corners, upper_bounds)
        ]
    return torch.stack(corners, -1)
def warp_boxes(boxes, M, width, height):
    """Apply a 3x3 transform to boxes and return their clipped bounds.

    Args:
        boxes (ndarray): Boxes as rows of [x1, y1, x2, y2].
        M (ndarray): 3x3 transform matrix applied to the box corners.
        width (number): Upper clip bound for x coordinates.
        height (number): Upper clip bound for y coordinates.

    Returns:
        ndarray: float32 array of axis-aligned boxes enclosing the warped
            corners; the input itself is returned when it has no rows.
    """
    num_boxes = len(boxes)
    if not num_boxes:
        return boxes

    # Homogeneous coordinates of the four corners of every box:
    # (x1, y1), (x2, y2), (x1, y2), (x2, y1).
    corners = np.ones((num_boxes * 4, 3))
    corners[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
        num_boxes * 4, 2)
    corners = corners @ M.T
    # Perspective divide, then regroup as 8 values per box.
    corners = (corners[:, :2] / corners[:, 2:3]).reshape(num_boxes, 8)

    xs = corners[:, [0, 2, 4, 6]]
    ys = corners[:, [1, 3, 5, 7]]
    warped = np.stack([xs.min(1), ys.min(1), xs.max(1), ys.max(1)], axis=1)

    warped[:, [0, 2]] = warped[:, [0, 2]].clip(0, width)
    warped[:, [1, 3]] = warped[:, [1, 3]].clip(0, height)
    return warped.astype(np.float32)
class NanoDetPlusHead(nn.Module):
    """Detection head used in NanoDet-Plus.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        input_channel (int): Number of channels of the input feature.
        feat_channels (int): Number of channels of the feature.
            Default: 96.
        stacked_convs (int): Number of conv layers in the stacked convs.
            Default: 2.
        kernel_size (int): Size of the convolving kernel. Default: 5.
        strides (list[int]): Strides of input multi-level feature maps.
            Default: [8, 16, 32].
        conv_type (str): Type of the convolution; 'Conv' selects
            ``ConvModule``, anything else ``DepthwiseConvModule``.
            Default: "DWConv".
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Default: dict(type='BN').
        reg_max (int): The maximal value of the discrete set. Default: 7.
        activation (str): Type of activation function. Default: "LeakyReLU".
        assigner_cfg (dict): Accepted for config compatibility; not used by
            this inference-only head. Default: dict(topk=13).
    """

    def __init__(self,
                 num_classes,
                 input_channel,
                 feat_channels=96,
                 stacked_convs=2,
                 kernel_size=5,
                 strides=[8, 16, 32],
                 conv_type='DWConv',
                 norm_cfg=dict(type='BN'),
                 reg_max=7,
                 activation='LeakyReLU',
                 assigner_cfg=dict(topk=13),
                 **kwargs):
        super(NanoDetPlusHead, self).__init__()
        self.num_classes = num_classes
        self.in_channels = input_channel
        self.feat_channels = feat_channels
        self.stacked_convs = stacked_convs
        self.kernel_size = kernel_size
        self.strides = strides
        self.reg_max = reg_max
        self.activation = activation
        self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule
        self.norm_cfg = norm_cfg
        self.distribution_project = Integral(self.reg_max)
        self._init_layers()

    def _init_layers(self):
        """Build one conv tower and one 1x1 prediction conv per stride."""
        self.cls_convs = nn.ModuleList()
        for _ in self.strides:
            cls_convs = self._buid_not_shared_head()
            self.cls_convs.append(cls_convs)

        # Each prediction conv emits class logits followed by the box
        # distribution logits (4 sides x (reg_max + 1) bins).
        self.gfl_cls = nn.ModuleList([
            nn.Conv2d(
                self.feat_channels,
                self.num_classes + 4 * (self.reg_max + 1),
                1,
                padding=0,
            ) for _ in self.strides
        ])

    def _buid_not_shared_head(self):
        """Build the (non-shared) stacked conv tower for a single level."""
        cls_convs = nn.ModuleList()
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            cls_convs.append(
                self.ConvModule(
                    chn,
                    self.feat_channels,
                    self.kernel_size,
                    stride=1,
                    padding=self.kernel_size // 2,
                    norm_cfg=self.norm_cfg,
                    bias=self.norm_cfg is None,
                    activation=self.activation,
                ))
        return cls_convs

    def forward(self, feats):
        """Run the head on multi-level features.

        Args:
            feats (Sequence[Tensor]): One feature map per stride.

        Returns:
            Tensor: Predictions of all levels concatenated along the point
                axis, shape (batch, num_points, num_classes + 4*(reg_max+1)).
        """
        if torch.onnx.is_in_onnx_export():
            return self._forward_onnx(feats)
        outputs = []
        for feat, cls_convs, gfl_cls in zip(
                feats,
                self.cls_convs,
                self.gfl_cls,
        ):
            for conv in cls_convs:
                feat = conv(feat)
            output = gfl_cls(feat)
            outputs.append(output.flatten(start_dim=2))
        outputs = torch.cat(outputs, dim=2).permute(0, 2, 1)
        return outputs

    def _forward_onnx(self, feats):
        """Forward variant used during ONNX export.

        Same layout as ``forward`` but the class logits are passed through
        a sigmoid so the exported graph outputs scores directly.
        """
        outputs = []
        for feat, cls_convs, gfl_cls in zip(
                feats,
                self.cls_convs,
                self.gfl_cls,
        ):
            for conv in cls_convs:
                feat = conv(feat)
            output = gfl_cls(feat)
            cls_pred, reg_pred = output.split(
                [self.num_classes, 4 * (self.reg_max + 1)], dim=1)
            cls_pred = cls_pred.sigmoid()
            out = torch.cat([cls_pred, reg_pred], dim=1)
            outputs.append(out.flatten(start_dim=2))
        return torch.cat(outputs, dim=2).permute(0, 2, 1)

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as a CPU numpy array if it is a tensor."""
        if isinstance(value, torch.Tensor):
            return value.cpu().numpy()
        return value

    def post_process(self, preds, meta):
        """Prediction results post processing. Decode bboxes and rescale
        to original image size.

        Args:
            preds (Tensor): Prediction output.
            meta (dict): Meta info. Reads ``meta['img']``,
                ``meta['warp_matrix']`` and the ``height``, ``width`` and
                ``id`` entries of ``meta['img_info']``; all per-image
                entries are iterated in lockstep.

        Returns:
            dict: Maps each image id to a dict ``{class_index: list of
                [x1, y1, x2, y2, score]}``.
        """
        cls_scores, bbox_preds = preds.split(
            [self.num_classes, 4 * (self.reg_max + 1)], dim=-1)
        result_list = self.get_bboxes(cls_scores, bbox_preds, meta)
        det_results = {}
        warp_matrixes = meta['warp_matrix']
        img_heights = self._to_numpy(meta['img_info']['height'])
        img_widths = self._to_numpy(meta['img_info']['width'])
        img_ids = self._to_numpy(meta['img_info']['id'])

        for result, img_width, img_height, img_id, warp_matrix in zip(
                result_list, img_widths, img_heights, img_ids, warp_matrixes):
            det_result = {}
            det_bboxes, det_labels = result
            det_bboxes = det_bboxes.detach().cpu().numpy()
            # Undo the preprocessing warp to get original-image coordinates.
            det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4],
                                           np.linalg.inv(warp_matrix),
                                           img_width, img_height)
            classes = det_labels.detach().cpu().numpy()
            for i in range(self.num_classes):
                inds = classes == i
                det_result[i] = np.concatenate(
                    [
                        det_bboxes[inds, :4].astype(np.float32),
                        det_bboxes[inds, 4:5].astype(np.float32),
                    ],
                    axis=1,
                ).tolist()
            det_results[img_id] = det_result
        return det_results

    def get_bboxes(self, cls_preds, reg_preds, img_metas):
        """Decode the outputs to bboxes.

        Args:
            cls_preds (Tensor): Shape (num_imgs, num_points, num_classes).
            reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)).
            img_metas (dict): Dict of image info.

        Returns:
            results_list (list[tuple]): List of detection bboxes and labels.
        """
        device = cls_preds.device
        b = cls_preds.shape[0]
        input_height, input_width = img_metas['img'].shape[2:]
        input_shape = (input_height, input_width)

        featmap_sizes = [(math.ceil(input_height / stride),
                          math.ceil(input_width / stride))
                         for stride in self.strides]
        mlvl_center_priors = [
            self.get_single_level_center_priors(
                b,
                featmap_sizes[i],
                stride,
                dtype=torch.float32,
                device=device,
            ) for i, stride in enumerate(self.strides)
        ]
        center_priors = torch.cat(mlvl_center_priors, dim=1)
        # Distances are predicted in units of the level stride.
        dis_preds = (
            self.distribution_project(reg_preds) * center_priors[..., 2, None])
        bboxes = distance2bbox(
            center_priors[..., :2], dis_preds, max_shape=input_shape)
        scores = cls_preds.sigmoid()
        result_list = []
        for i in range(b):
            score, bbox = scores[i], bboxes[i]
            # multiclass_nms drops the last score column, so append a dummy
            # background column of zeros.
            padding = score.new_zeros(score.shape[0], 1)
            score = torch.cat([score, padding], dim=1)
            results = multiclass_nms(
                bbox,
                score,
                score_thr=0.05,
                nms_cfg=dict(type='nms', iou_threshold=0.6),
                max_num=100,
            )
            result_list.append(results)
        return result_list

    def get_single_level_center_priors(self, batch_size, featmap_size, stride,
                                       dtype, device):
        """Generate centers of a single stage feature map.

        Args:
            batch_size (int): Number of images in one batch.
            featmap_size (tuple[int]): height and width of the feature map
            stride (int): down sample stride of the feature map
            dtype (obj:`torch.dtype`): data type of the tensors
            device (obj:`torch.device`): device of the tensors

        Return:
            priors (Tensor): center priors of a single level feature map,
                shape (batch_size, h * w, 4) holding [x, y, stride, stride].
        """
        h, w = featmap_size
        x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
        y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
        y, x = torch.meshgrid(y_range, x_range)
        y = y.flatten()
        x = x.flatten()
        strides = x.new_full((x.shape[0], ), stride)
        priors = torch.stack([x, y, strides, strides], dim=-1)
        return priors.unsqueeze(0).repeat(batch_size, 1, 1)
| @@ -0,0 +1,64 @@ | |||||
| # The implementation here is modified based on nanodet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from .ghost_pan import GhostPAN | |||||
| from .nanodet_plus_head import NanoDetPlusHead | |||||
| from .shufflenetv2 import ShuffleNetV2 | |||||
class OneStageDetector(nn.Module):
    """Detector made of a ShuffleNetV2 backbone, GhostPAN neck and
    NanoDetPlusHead head, built with a fixed configuration."""

    def __init__(self):
        super(OneStageDetector, self).__init__()
        self.backbone = ShuffleNetV2(
            model_size='1.0x',
            out_stages=(2, 3, 4),
            with_last_conv=False,
            kernal_size=3,
            activation='LeakyReLU',
            pretrain=False)
        self.fpn = GhostPAN(
            in_channels=[116, 232, 464],
            out_channels=96,
            use_depthwise=True,
            kernel_size=5,
            expand=1,
            num_blocks=1,
            use_res=False,
            num_extra_level=1,
            upsample_cfg=dict(scale_factor=2, mode='bilinear'),
            norm_cfg=dict(type='BN'),
            activation='LeakyReLU')
        self.head = NanoDetPlusHead(
            num_classes=3,
            input_channel=96,
            feat_channels=96,
            stacked_convs=2,
            kernel_size=5,
            strides=[8, 16, 32, 64],
            conv_type='DWConv',
            norm_cfg=dict(type='BN'),
            reg_max=7,
            activation='LeakyReLU',
            assigner_cfg=dict(topk=13))
        self.epoch = 0

    def forward(self, x):
        """Run backbone, then neck and head when they are present."""
        x = self.backbone(x)
        if hasattr(self, 'fpn'):
            x = self.fpn(x)
        if hasattr(self, 'head'):
            x = self.head(x)
        return x

    def inference(self, meta):
        """Run the model on ``meta['img']`` and post-process the output.

        Args:
            meta (dict): Must contain ``'img'``; the whole dict is also
                handed to ``self.head.post_process``.

        Returns:
            The value returned by ``self.head.post_process``.
        """
        # torch.cuda.synchronize() raises on hosts without CUDA, so only
        # synchronize when a CUDA runtime is actually available.
        sync = torch.cuda.is_available()
        with torch.no_grad():
            if sync:
                torch.cuda.synchronize()
            preds = self(meta['img'])
            if sync:
                torch.cuda.synchronize()
            results = self.head.post_process(preds, meta)
            if sync:
                torch.cuda.synchronize()
        return results
| @@ -0,0 +1,182 @@ | |||||
| # The implementation here is modified based on nanodet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from .utils import act_layers | |||||
def channel_shuffle(x, groups):
    """Interleave the channels of ``x`` across ``groups`` groups.

    Args:
        x (Tensor): 4-D tensor (batch, channels, height, width).
        groups (int): Number of channel groups to interleave.

    Returns:
        Tensor: Tensor of the same shape with shuffled channel order.
    """
    n, c, h, w = x.data.size()
    # (n, groups, c/groups, h, w) -> swap the two channel axes -> flatten.
    grouped = x.view(n, groups, c // groups, h, w)
    swapped = torch.transpose(grouped, 1, 2).contiguous()
    return swapped.view(n, -1, h, w)
class ShuffleV2Block(nn.Module):
    """ShuffleNetV2 unit with two branches followed by a channel shuffle.

    Args:
        inp (int): Number of input channels.
        oup (int): Number of output channels; each branch yields ``oup // 2``.
        stride (int): Stride in [1, 3]. With stride 1 the input is split in
            half and only the second half is transformed; otherwise both
            branches process the full input.
        activation (str): Activation name passed to ``act_layers``.

    Raises:
        ValueError: If ``stride`` is outside [1, 3].
    """

    def __init__(self, inp, oup, stride, activation='ReLU'):
        super().__init__()

        if stride < 1 or stride > 3:
            raise ValueError('illegal stride value')
        self.stride = stride

        branch_features = oup // 2
        # With stride 1 the input is split in two, so it must hold exactly
        # twice the per-branch channel count.
        assert (self.stride != 1) or (inp == branch_features << 1)

        downsample = self.stride > 1

        branch1_layers = []
        if downsample:
            branch1_layers = [
                self.depthwise_conv(
                    inp, inp, kernel_size=3, stride=self.stride, padding=1),
                nn.BatchNorm2d(inp),
                nn.Conv2d(
                    inp,
                    branch_features,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    bias=False),
                nn.BatchNorm2d(branch_features),
                act_layers(activation),
            ]
        self.branch1 = nn.Sequential(*branch1_layers)

        branch2_in = inp if downsample else branch_features
        self.branch2 = nn.Sequential(
            nn.Conv2d(
                branch2_in,
                branch_features,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            ),
            nn.BatchNorm2d(branch_features),
            act_layers(activation),
            self.depthwise_conv(
                branch_features,
                branch_features,
                kernel_size=3,
                stride=self.stride,
                padding=1,
            ),
            nn.BatchNorm2d(branch_features),
            nn.Conv2d(
                branch_features,
                branch_features,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            ),
            nn.BatchNorm2d(branch_features),
            act_layers(activation),
        )

    @staticmethod
    def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
        """Return a Conv2d with one group per input channel."""
        return nn.Conv2d(
            i, o, kernel_size, stride, padding, bias=bias, groups=i)

    def forward(self, x):
        """Apply both branches, concatenate and shuffle with 2 groups."""
        if self.stride == 1:
            passthrough, to_transform = x.chunk(2, dim=1)
            halves = (passthrough, self.branch2(to_transform))
        else:
            halves = (self.branch1(x), self.branch2(x))
        return channel_shuffle(torch.cat(halves, dim=1), 2)
class ShuffleNetV2(nn.Module):
    """ShuffleNetV2 backbone returning the features of selected stages.

    Args:
        model_size (str): One of '0.5x', '1.0x', '1.5x', '2.0x'.
        out_stages (tuple[int]): Stages (subset of 2, 3, 4) whose outputs
            are returned by ``forward``.
        with_last_conv (bool): When True, a 1x1 conv block is appended to
            stage4.
        kernal_size (int): Stored on the instance; not otherwise used here.
        activation (str): Activation name passed to ``act_layers``.
        pretrain (bool): Accepted but not used by this constructor.

    Raises:
        NotImplementedError: If ``model_size`` is not a supported value.
    """

    def __init__(
        self,
        model_size='1.5x',
        out_stages=(2, 3, 4),
        with_last_conv=False,
        kernal_size=3,
        activation='ReLU',
        pretrain=True,
    ):
        super().__init__()
        # out_stages can only be in (2, 3, 4)
        assert set(out_stages).issubset((2, 3, 4))

        print('model size is ', model_size)

        self.stage_repeats = [4, 8, 4]
        self.model_size = model_size
        self.out_stages = out_stages
        self.with_last_conv = with_last_conv
        self.kernal_size = kernal_size
        self.activation = activation

        # Channel widths: [stem, stage2, stage3, stage4, last conv].
        supported_sizes = (
            ('0.5x', [24, 48, 96, 192, 1024]),
            ('1.0x', [24, 116, 232, 464, 1024]),
            ('1.5x', [24, 176, 352, 704, 1024]),
            ('2.0x', [24, 244, 488, 976, 2048]),
        )
        for size_name, channels in supported_sizes:
            if model_size == size_name:
                self._stage_out_channels = channels
                break
        else:
            raise NotImplementedError

        # building first layer
        stem_in = 3
        stem_out = self._stage_out_channels[0]
        self.conv1 = nn.Sequential(
            nn.Conv2d(stem_in, stem_out, 3, 2, 1, bias=False),
            nn.BatchNorm2d(stem_out),
            act_layers(activation),
        )
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Each stage starts with one stride-2 block followed by
        # ``repeats - 1`` stride-1 blocks.
        prev_channels = stem_out
        for offset, repeats in enumerate(self.stage_repeats):
            stage_channels = self._stage_out_channels[offset + 1]
            layers = [
                ShuffleV2Block(
                    prev_channels, stage_channels, 2, activation=activation)
            ]
            for _ in range(repeats - 1):
                layers.append(
                    ShuffleV2Block(
                        stage_channels,
                        stage_channels,
                        1,
                        activation=activation))
            setattr(self, 'stage{}'.format(offset + 2),
                    nn.Sequential(*layers))
            prev_channels = stage_channels

        last_channels = self._stage_out_channels[-1]
        if self.with_last_conv:
            conv5 = nn.Sequential(
                nn.Conv2d(prev_channels, last_channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(last_channels),
                act_layers(activation),
            )
            self.stage4.add_module('conv5', conv5)

    def forward(self, x):
        """Return a tuple with the outputs of the stages in ``out_stages``."""
        x = self.maxpool(self.conv1(x))
        collected = []
        for stage_idx in (2, 3, 4):
            x = getattr(self, 'stage{}'.format(stage_idx))(x)
            if stage_idx in self.out_stages:
                collected.append(x)
        return tuple(collected)
| @@ -0,0 +1,277 @@ | |||||
| # The implementation here is modified based on nanodet, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||||
| import torch | |||||
| import torch.nn as nn | |||||
# Maps an activation name to the torch.nn class that implements it.
# 'HardSwish' and 'Hardswish' are both accepted; None selects a no-op.
activations = {
    'ReLU': nn.ReLU,
    'LeakyReLU': nn.LeakyReLU,
    'ReLU6': nn.ReLU6,
    'SELU': nn.SELU,
    'ELU': nn.ELU,
    'GELU': nn.GELU,
    'PReLU': nn.PReLU,
    'SiLU': nn.SiLU,
    'HardSwish': nn.Hardswish,
    'Hardswish': nn.Hardswish,
    None: nn.Identity,
}


def act_layers(name):
    """Instantiate the activation layer registered under ``name``.

    Args:
        name (str or None): A key of ``activations``.

    Returns:
        nn.Module: GELU and PReLU are built with default arguments,
        LeakyReLU with ``negative_slope=0.1`` and ``inplace=True``, and
        every other entry with ``inplace=True``.

    Raises:
        AssertionError: If ``name`` is not a key of ``activations``.
    """
    assert name in activations
    if name in ('GELU', 'PReLU'):
        # These two are constructed without an ``inplace`` argument.
        return activations[name]()
    if name == 'LeakyReLU':
        return nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return activations[name](inplace=True)
# Maps a norm type name to (name abbreviation, torch.nn layer class).
norm_cfg = {
    'BN': ('bn', nn.BatchNorm2d),
    'SyncBN': ('bn', nn.SyncBatchNorm),
    'GN': ('gn', nn.GroupNorm),
}


def build_norm_layer(cfg, num_features, postfix=''):
    """Create a normalization layer from a config dict.

    Args:
        cfg (dict): Must contain ``type`` (a key of ``norm_cfg``). May
            contain ``requires_grad`` (bool, default True); all remaining
            entries are passed to the layer constructor. ``eps`` defaults
            to 1e-5. For 'GN', ``num_groups`` is required.
        num_features (int): Number of channels of the input.
        postfix (int, str): Appended to the abbreviation to form the name.

    Returns:
        tuple: ``(name, layer)`` where ``name`` is abbreviation + postfix
        and ``layer`` is the created nn.Module.

    Raises:
        KeyError: If ``cfg['type']`` is not a key of ``norm_cfg``.
    """
    assert isinstance(cfg, dict) and 'type' in cfg
    # Work on a copy so the caller's dict is left untouched.
    options = cfg.copy()
    layer_type = options.pop('type')

    if layer_type not in norm_cfg:
        raise KeyError('Unrecognized norm type {}'.format(layer_type))
    abbr, norm_layer = norm_cfg[layer_type]
    if norm_layer is None:
        raise NotImplementedError

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    requires_grad = options.pop('requires_grad', True)
    options.setdefault('eps', 1e-5)

    if layer_type == 'GN':
        # GroupNorm takes the channel count under a different keyword.
        assert 'num_groups' in options
        layer = norm_layer(num_channels=num_features, **options)
    else:
        layer = norm_layer(num_features, **options)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)

    for param in layer.parameters():
        param.requires_grad = requires_grad

    return name, layer
class ConvModule(nn.Module):
    """A conv block that contains conv/norm/activation layers.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        bias (bool or str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
            False.
        conv_cfg (dict): Config dict for convolution layer. Stored on the
            instance; the convolution built here is always nn.Conv2d.
        norm_cfg (dict): Config dict for normalization layer.
        activation (str): activation layer, "ReLU" by default.
        inplace (bool): Stored on the instance; the activation itself is
            created by ``act_layers(activation)``, which does not receive
            this flag.
        order (tuple[str]): The order of conv/norm/activation layers. It is a
            sequence of "conv", "norm" and "act". Examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias='auto',
        conv_cfg=None,
        norm_cfg=None,
        activation='ReLU',
        inplace=True,
        order=('conv', 'norm', 'act'),
    ):
        # Imported locally: this module does not import ``warnings`` at the
        # top, so the warning below used to fail with NameError.
        import warnings

        super(ConvModule, self).__init__()
        assert conv_cfg is None or isinstance(conv_cfg, dict)
        assert norm_cfg is None or isinstance(norm_cfg, dict)
        assert activation is None or isinstance(activation, str)
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.activation = activation
        self.inplace = inplace
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == {'conv', 'norm', 'act'}

        self.with_norm = norm_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = False if self.with_norm else True
        self.with_bias = bias

        if self.with_norm and self.with_bias:
            warnings.warn('ConvModule has norm and bias at the same time')

        # build convolution layer
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        # mirror the conv attributes on the wrapper
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        # build normalization layer
        if self.with_norm:
            # The norm sees out_channels when it runs after the conv and
            # in_channels when it runs before it.
            if order.index('norm') > order.index('conv'):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
            self.add_module(self.norm_name, norm)
        else:
            self.norm_name = None

        # build activation layer
        if self.activation:
            self.act = act_layers(self.activation)

    @property
    def norm(self):
        """The normalization layer, or None when the block has no norm."""
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def forward(self, x, norm=True):
        """Apply the layers in ``self.order``.

        Args:
            x (Tensor): Input tensor.
            norm (bool): Set to False to skip the normalization layer for
                this call.
        """
        for layer in self.order:
            if layer == 'conv':
                x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and self.activation:
                x = self.act(x)
        return x
class DepthwiseConvModule(nn.Module):
    """Depthwise-separable conv block: depthwise conv then 1x1 pointwise conv.

    Args:
        in_channels (int): Channels of the input; also the group count of
            the depthwise convolution.
        out_channels (int): Channels produced by the pointwise convolution.
        kernel_size (int or tuple[int]): Kernel of the depthwise conv.
        stride (int or tuple[int]): Stride of the depthwise conv.
        padding (int or tuple[int]): Padding of the depthwise conv.
        dilation (int or tuple[int]): Dilation of the depthwise conv.
        bias (bool or str): If `auto`, bias is enabled only when norm_cfg
            is None. The same value is used for both convolutions.
        norm_cfg (dict): Config dict for the two normalization layers, or
            None to build the block without normalization.
        activation (str): Activation name passed to ``act_layers``; None
            disables the activation.
        inplace (bool): Stored on the instance; not passed to
            ``act_layers``.
        order (tuple[str]): Execution order of the six steps 'depthwise',
            'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        bias='auto',
        norm_cfg=dict(type='BN'),
        activation='ReLU',
        inplace=True,
        order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'),
    ):
        # Imported locally: this module does not import ``warnings`` at the
        # top, so the warning below used to fail with NameError.
        import warnings

        super(DepthwiseConvModule, self).__init__()
        assert activation is None or isinstance(activation, str)
        self.activation = activation
        self.inplace = inplace
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 6
        assert set(order) == {
            'depthwise',
            'dwnorm',
            'act',
            'pointwise',
            'pwnorm',
        }

        self.with_norm = norm_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = False if self.with_norm else True
        self.with_bias = bias

        if self.with_norm and self.with_bias:
            warnings.warn(
                'DepthwiseConvModule has norm and bias at the same time')

        # build convolution layers
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias)

        # mirror the conv attributes on the wrapper
        self.in_channels = self.depthwise.in_channels
        self.out_channels = self.pointwise.out_channels
        self.kernel_size = self.depthwise.kernel_size
        self.stride = self.depthwise.stride
        self.padding = self.depthwise.padding
        self.dilation = self.depthwise.dilation
        self.transposed = self.depthwise.transposed
        self.output_padding = self.depthwise.output_padding

        # build normalization layers
        if self.with_norm:
            _, self.dwnorm = build_norm_layer(norm_cfg, in_channels)
            _, self.pwnorm = build_norm_layer(norm_cfg, out_channels)

        # build activation layer
        if self.activation:
            self.act = act_layers(self.activation)

    def forward(self, x, norm=True):
        """Apply the steps listed in ``self.order``.

        Args:
            x (Tensor): Input tensor.
            norm (bool): Accepted for signature parity with ConvModule;
                it is not consulted here, so existing normalization layers
                are always applied.
        """
        for layer_name in self.order:
            if layer_name == 'act':
                if self.activation:
                    x = self.act(x)
            elif layer_name in ('dwnorm', 'pwnorm'):
                # The norm layers only exist when norm_cfg was given;
                # looking them up unconditionally raised AttributeError.
                if self.with_norm:
                    x = getattr(self, layer_name)(x)
            else:
                x = getattr(self, layer_name)(x)
        return x
| @@ -1,3 +1,5 @@ | |||||
| # The implementation here is modified based on DeOldify, originally MIT License | |||||
| # and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py | |||||
| import numpy as np | import numpy as np | ||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation here is modified based on DeOldify, originally MIT License and | |||||
| # publicly available at https://github.com/jantic/DeOldify/blob/master/fastai/callbacks/hooks.py | |||||
| import functools | import functools | ||||
| from enum import Enum | from enum import Enum | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Part of the implementation is borrowed and modified from Face-Alignment, | |||||
| # publicly available at https://github.com/foamliu/Face-Alignment/blob/master/align_faces.py | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| from skimage import transform as trans | from skimage import transform as trans | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| import cv2 | import cv2 | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from FaceQuality, made publicly available under the MIT License | |||||
| # at https://github.com/deepcam-cn/FaceQuality/blob/master/models/model_resnet.py | |||||
| import torch | import torch | ||||
| from torch import nn | from torch import nn | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The GPEN implementation is also open-sourced by the authors, | |||||
| # and available at https://github.com/yangxy/GPEN/blob/main/face_model/gpen_model.py | |||||
| import functools | import functools | ||||
| import itertools | import itertools | ||||
| import math | import math | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import math | import math | ||||
| import os.path as osp | import os.path as osp | ||||
| from copy import deepcopy | from copy import deepcopy | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from InsightFace_Pytorch, | |||||
| # made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||||
| from collections import namedtuple | from collections import namedtuple | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The GPEN implementation is also open-sourced by the authors, | |||||
| # and available at https://github.com/yangxy/GPEN/tree/main/training/loss/id_loss.py | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | import torch.nn.functional as F | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from InsightFace_Pytorch, | |||||
| # made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||||
| from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | ||||
| Module, PReLU, Sequential) | Module, PReLU, Sequential) | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The GPEN implementation is also open-sourced by the authors, | |||||
| # and available at https://github.com/yangxy/GPEN/blob/main/face_detect/retinaface_detection.py | |||||
| import os | import os | ||||
| import cv2 | import cv2 | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License | |||||
| # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py | |||||
| import time | import time | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License | |||||
| # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py | |||||
| from collections import OrderedDict | from collections import OrderedDict | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import random | import random | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| # APPs that facilitate the use of pretrained neural networks. | # APPs that facilitate the use of pretrained neural networks. | ||||
| import os.path as osp | import os.path as osp | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import os | import os | ||||
| import random | import random | ||||
| @@ -1,3 +1,6 @@ | |||||
| # Part of the implementation is borrowed and modified from latent-diffusion, | |||||
| # publicly available at https://github.com/CompVis/latent-diffusion. | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import math | import math | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import numpy as np | import numpy as np | ||||
| import scipy.linalg as linalg | import scipy.linalg as linalg | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import colorsys | import colorsys | ||||
| import random | import random | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import cv2 | import cv2 | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| r"""SVD of linear degradation matrices described in the paper | r"""SVD of linear degradation matrices described in the paper | ||||
| ``Denoising Diffusion Restoration Models.'' | ``Denoising Diffusion Restoration Models.'' | ||||
| @article{kawar2022denoising, | @article{kawar2022denoising, | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import base64 | import base64 | ||||
| import binascii | import binascii | ||||
| import hashlib | import hashlib | ||||
| @@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | ||||
| ]) | ]) | ||||
| self.infer_result = {'vid': [], 'sid': [], 'pred': []} | |||||
| sampling_method = self.cfg.dataset.sampling_method.name | sampling_method = self.cfg.dataset.sampling_method.name | ||||
| self.neighbor_size = self.cfg.dataset.sampling_method.params[ | self.neighbor_size = self.cfg.dataset.sampling_method.params[ | ||||
| sampling_method].neighbor_size | sampling_method].neighbor_size | ||||
| @@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| shot_num = len(sids) | shot_num = len(sids) | ||||
| cnt = shot_num // bs + 1 | cnt = shot_num // bs + 1 | ||||
| infer_sid, infer_pred = [], [] | |||||
| infer_result = {} | |||||
| for i in range(cnt): | for i in range(cnt): | ||||
| start = i * bs | start = i * bs | ||||
| end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | ||||
| @@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| input_ = torch.stack(input_) | input_ = torch.stack(input_) | ||||
| outputs = self.shared_step(input_) # shape [b,2] | outputs = self.shared_step(input_) # shape [b,2] | ||||
| prob = F.softmax(outputs, dim=1) | prob = F.softmax(outputs, dim=1) | ||||
| self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) | |||||
| self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) | |||||
| self.infer_result['pred'] = np.stack(self.infer_result['pred']) | |||||
| infer_sid.extend(sid_.cpu().detach().numpy()) | |||||
| infer_pred.extend(prob[:, 1].cpu().detach().numpy()) | |||||
| infer_result.update({'pred': np.stack(infer_pred)}) | |||||
| infer_result.update({'sid': infer_sid}) | |||||
| assert len(self.infer_result['sid']) == len(sids) | |||||
| assert len(self.infer_result['pred']) == len(inputs) | |||||
| return self.infer_result | |||||
| assert len(infer_result['sid']) == len(sids) | |||||
| assert len(infer_result['pred']) == len(inputs) | |||||
| return infer_result | |||||
| def shared_step(self, inputs): | def shared_step(self, inputs): | ||||
| with torch.no_grad(): | with torch.no_grad(): | ||||
| @@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel): | |||||
| thres = self.cfg.pipeline.save_threshold | thres = self.cfg.pipeline.save_threshold | ||||
| anno_dict = get_pred_boundary(pred_dict, thres) | anno_dict = get_pred_boundary(pred_dict, thres) | ||||
| scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) | |||||
| scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( | |||||
| self.shot2keyf, anno_dict) | |||||
| if self.cfg.pipeline.save_split_scene: | if self.cfg.pipeline.save_split_scene: | ||||
| re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | ||||
| print(f'Split scene video saved to {re_dir}') | print(f'Split scene video saved to {re_dir}') | ||||
| return len(scene_list), scene_dict_lst | |||||
| return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst | |||||
| def preprocess(self, inputs): | def preprocess(self, inputs): | ||||
| logger.info('Begin shot detect......') | logger.info('Begin shot detect......') | ||||
| @@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict): | |||||
| scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | ||||
| scene_dict_lst = [] | scene_dict_lst = [] | ||||
| shot_num = len(shot2keyf) | |||||
| shot_dict_lst = [] | |||||
| for item in shot2keyf: | |||||
| tmp = item.split(' ') | |||||
| shot_dict_lst.append({ | |||||
| 'frame': [tmp[0], tmp[1]], | |||||
| 'timestamps': [tmp[-2], tmp[-1]] | |||||
| }) | |||||
| assert len(scene_list) == len(pair_list) | assert len(scene_list) == len(pair_list) | ||||
| for scene_ind, scene_item in enumerate(scene_list): | for scene_ind, scene_item in enumerate(scene_list): | ||||
| scene_dict_lst.append({ | scene_dict_lst.append({ | ||||
| 'shot': pair_list[scene_ind], | 'shot': pair_list[scene_ind], | ||||
| 'frame': scene_item[0], | 'frame': scene_item[0], | ||||
| 'timestamp': scene_item[1] | |||||
| 'timestamps': scene_item[1] | |||||
| }) | }) | ||||
| return scene_dict_lst, scene_list | |||||
| return scene_dict_lst, scene_list, shot_num, shot_dict_lst | |||||
| def scene2video(source_movie_fn, scene_list, thres): | def scene2video(source_movie_fn, scene_list, thres): | ||||
| @@ -0,0 +1,20 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| from typing import TYPE_CHECKING | |||||
| from modelscope.utils.import_utils import LazyImportModule | |||||
| if TYPE_CHECKING: | |||||
| from .seg_infer import F3NetProductSegmentation | |||||
| else: | |||||
| _import_structure = {'seg_infer': ['F3NetProductSegmentation']} | |||||
| import sys | |||||
| sys.modules[__name__] = LazyImportModule( | |||||
| __name__, | |||||
| globals()['__file__'], | |||||
| _import_structure, | |||||
| module_spec=__spec__, | |||||
| extra_objects={}, | |||||
| ) | |||||
| @@ -0,0 +1,197 @@ | |||||
| # The implementation here is modified based on F3Net, | |||||
| # originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
class Bottleneck(nn.Module):
    """Residual bottleneck: 1x1 conv, 3x3 conv, 1x1 conv with 4x expansion.

    Args:
        inplanes (int): Channels of the input.
        planes (int): Channels of the two inner convolutions; the block
            outputs ``planes * 4`` channels.
        stride (int): Stride of the 3x3 convolution.
        downsample (nn.Module or None): Optional module applied to the
            input before it is added to the residual branch.
        dilation (int): Dilation of the 3x3 convolution.
    """

    def __init__(self,
                 inplanes,
                 planes,
                 stride=1,
                 downsample=None,
                 dilation=1):
        super().__init__()
        expanded = planes * 4
        # Padding that keeps the spatial size of the 3x3 conv at stride 1.
        pad = (3 * dilation - 1) // 2

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            padding=pad,
            bias=False,
            dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, expanded, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(residual_branch(x) + shortcut(x))``."""
        residual = self.conv1(x)
        residual = F.relu(self.bn1(residual), inplace=True)
        residual = self.conv2(residual)
        residual = F.relu(self.bn2(residual), inplace=True)
        residual = self.bn3(self.conv3(residual))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)
        return F.relu(residual + shortcut, inplace=True)
class ResNet(nn.Module):
    """Bottleneck ResNet feature extractor with stages of 3, 4, 6 and 3 blocks.

    ``forward`` returns the outputs of the four stages instead of a
    classification result.
    """

    def __init__(self):
        super().__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # (planes, number of blocks, stride) for layer1..layer4.
        stage_specs = ((64, 3, 1), (128, 4, 2), (256, 6, 2), (512, 3, 2))
        for index, (planes, blocks, stride) in enumerate(stage_specs, 1):
            stage = self.make_layer(planes, blocks, stride=stride, dilation=1)
            setattr(self, 'layer{}'.format(index), stage)

    def make_layer(self, planes, blocks, stride, dilation):
        """Build one stage of ``blocks`` bottlenecks and update ``inplanes``.

        The first block always receives a 1x1 conv + BN projection as its
        shortcut; the remaining blocks use the identity shortcut.
        """
        out_planes = planes * 4
        projection = nn.Sequential(
            nn.Conv2d(
                self.inplanes,
                out_planes,
                kernel_size=1,
                stride=stride,
                bias=False),
            nn.BatchNorm2d(out_planes))
        first = Bottleneck(
            self.inplanes, planes, stride, projection, dilation=dilation)
        self.inplanes = out_planes
        rest = [
            Bottleneck(self.inplanes, planes, dilation=dilation)
            for _ in range(blocks - 1)
        ]
        return nn.Sequential(first, *rest)

    def forward(self, x):
        """Return the feature maps of layer1, layer2, layer3 and layer4."""
        # NOTE(review): the input is forced to a single 3x448x448 image, so
        # other batch sizes or resolutions are not supported here.
        x = x.reshape(1, 3, 448, 448)
        stem = F.relu(self.bn1(self.conv1(x)), inplace=True)
        stem = F.max_pool2d(stem, kernel_size=3, stride=2, padding=1)
        feat2 = self.layer1(stem)
        feat3 = self.layer2(feat2)
        feat4 = self.layer3(feat3)
        feat5 = self.layer4(feat4)
        return feat2, feat3, feat4, feat5
class CFM(nn.Module):
    """Fuses two 64-channel feature maps through a shared product term.

    Each input goes through its own chain of four 3x3 conv + BN + ReLU
    steps (suffix ``h`` for ``left``, ``v`` for ``down``). After the second
    step the two branches are multiplied element-wise and the product
    feeds the third step of both branches.
    """

    def __init__(self):
        super().__init__()
        # Registers conv1h, bn1h, ..., conv4h, bn4h, then conv1v, ..., bn4v,
        # keeping the attribute names and order checkpoints rely on.
        for branch in ('h', 'v'):
            for step in range(1, 5):
                setattr(
                    self, 'conv{}{}'.format(step, branch),
                    nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1))
                setattr(self, 'bn{}{}'.format(step, branch),
                        nn.BatchNorm2d(64))

    @staticmethod
    def _conv_bn_relu(conv, bn, x):
        """Apply conv, batch norm and an in-place ReLU."""
        return F.relu(bn(conv(x)), inplace=True)

    def forward(self, left, down):
        """Return the refined ``(left, down)`` feature pair.

        ``down`` is bilinearly resized to the spatial size of ``left`` when
        the two differ.
        """
        target_size = left.size()[2:]
        if down.size()[2:] != target_size:
            down = F.interpolate(down, size=target_size, mode='bilinear')

        step = self._conv_bn_relu
        h1 = step(self.conv1h, self.bn1h, left)
        h2 = step(self.conv2h, self.bn2h, h1)
        v1 = step(self.conv1v, self.bn1v, down)
        v2 = step(self.conv2v, self.bn2v, v1)

        fuse = h2 * v2

        h3 = step(self.conv3h, self.bn3h, fuse) + h1
        h4 = step(self.conv4h, self.bn4h, h3)
        v3 = step(self.conv3v, self.bn3v, fuse) + v1
        v4 = step(self.conv4v, self.bn4v, v3)
        return h4, v4
class Decoder(nn.Module):
    """Top-down decoder that chains three CFM modules across four levels."""

    def __init__(self):
        super().__init__()
        self.cfm45 = CFM()
        self.cfm34 = CFM()
        self.cfm23 = CFM()

    def forward(self, out2h, out3h, out4h, out5v, fback=None):
        """Fuse the four feature levels from the deepest to the shallowest.

        Args:
            out2h, out3h, out4h, out5v (Tensor): Feature maps of the four
                levels.
            fback (Tensor or None): Optional feedback map. When given, it is
                bilinearly resized to each level and added to that level's
                features before fusion.

        Returns:
            tuple: ``(out2h, out3h, out4h, out5v, pred)`` where ``pred`` is
            the second output of the last CFM.
        """
        if fback is not None:
            # One resized copy of the feedback per level, deepest first.
            refine5, refine4, refine3, refine2 = [
                F.interpolate(fback, size=level.size()[2:], mode='bilinear')
                for level in (out5v, out4h, out3h, out2h)
            ]
            out5v = out5v + refine5
            out4h = out4h + refine4
            out3h = out3h + refine3
            out2h = out2h + refine2

        out4h, out4v = self.cfm45(out4h, out5v)
        out3h, out3v = self.cfm34(out3h, out4v)
        out2h, pred = self.cfm23(out2h, out3v)
        return out2h, out3h, out4h, out5v, pred
class F3Net(nn.Module):
    """F3Net: ResNet backbone, 64-channel squeeze layers and two decoders.

    ``forward`` returns six single-channel maps, all resized to the same
    spatial size.
    """

    def __init__(self):
        super().__init__()
        self.bkbone = ResNet()

        # 1x1 conv + BN + ReLU reducing each backbone level to 64 channels.
        for level, channels in ((5, 2048), (4, 1024), (3, 512), (2, 256)):
            squeeze = nn.Sequential(
                nn.Conv2d(channels, 64, 1), nn.BatchNorm2d(64),
                nn.ReLU(inplace=True))
            setattr(self, 'squeeze{}'.format(level), squeeze)

        self.decoder1 = Decoder()
        self.decoder2 = Decoder()

        # 3x3 heads mapping 64-channel features to one output channel.
        for head in ('linearp1', 'linearp2', 'linearr2', 'linearr3',
                     'linearr4', 'linearr5'):
            setattr(self, head,
                    nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1))

    def forward(self, x, shape=None):
        """Run the network.

        Args:
            x (Tensor): Input image data; it is reshaped to
                ``(1, 3, 448, 448)``.
            shape (tuple or None): Output spatial size. Defaults to the
                spatial size of the reshaped input.

        Returns:
            tuple: ``(pred1, pred2, out2h, out3h, out4h, out5h)``, the two
            decoder predictions followed by the four per-level maps.
        """
        # NOTE(review): hard-coded to one 3x448x448 image per call.
        x = x.reshape(1, 3, 448, 448)
        feat2, feat3, feat4, feat5 = self.bkbone(x)
        out2h = self.squeeze2(feat2)
        out3h = self.squeeze3(feat3)
        out4h = self.squeeze4(feat4)
        out5v = self.squeeze5(feat5)

        # The second decoder pass is refined by the first pass's prediction.
        out2h, out3h, out4h, out5v, pred1 = self.decoder1(
            out2h, out3h, out4h, out5v)
        out2h, out3h, out4h, out5v, pred2 = self.decoder2(
            out2h, out3h, out4h, out5v, pred1)

        if shape is None:
            shape = x.size()[2:]

        def upsample(head, feature):
            return F.interpolate(head(feature), size=shape, mode='bilinear')

        pred1 = upsample(self.linearp1, pred1)
        pred2 = upsample(self.linearp2, pred2)
        out2h = upsample(self.linearr2, out2h)
        out3h = upsample(self.linearr3, out3h)
        out4h = upsample(self.linearr4, out4h)
        out5h = upsample(self.linearr5, out5v)
        return pred1, pred2, out2h, out3h, out4h, out5h
| @@ -0,0 +1,77 @@ | |||||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||||
| import cv2 | |||||
| import numpy as np | |||||
| import torch | |||||
| from PIL import Image | |||||
| from modelscope.metainfo import Models | |||||
| from modelscope.models.base import TorchModel | |||||
| from modelscope.models.builder import MODELS | |||||
| from modelscope.utils.constant import ModelFile, Tasks | |||||
| from modelscope.utils.logger import get_logger | |||||
| from .net import F3Net | |||||
| logger = get_logger() | |||||
def load_state_dict(model_dir, device):
    """Load the checkpoint in ``model_dir`` and strip any ``module.`` prefix.

    Args:
        model_dir: Directory containing ``ModelFile.TORCH_MODEL_BIN_FILE``.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        A dict mapping parameter names (without a leading ``module.``) to the
        loaded values.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(
        '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
        map_location=device)
    # Match the full 'module.' prefix (7 characters) so that keys which merely
    # begin with the letters "module" are not truncated by mistake.
    prefix = 'module.'
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in checkpoint.items()
    }
@MODELS.register_module(
    Tasks.product_segmentation, module_name=Models.product_segmentation)
class F3NetForProductSegmentation(TorchModel):
    """Wrapper that builds an ``F3Net`` and loads its weights from a model directory."""

    def __init__(self, model_dir, device_id=0, *args, **kwargs):
        """Build the network, load the checkpoint and switch to eval mode.

        Args:
            model_dir: Directory holding the checkpoint file.
            device_id: Forwarded unchanged to the base-class constructor.
        """
        super().__init__(
            model_dir=model_dir, device_id=device_id, *args, **kwargs)

        self.model = F3Net()
        # The device is chosen purely from CUDA availability.
        if torch.cuda.is_available():
            self.device = 'cuda'
            logger.info('Use GPU')
        else:
            self.device = 'cpu'
            logger.info('Use CPU')

        self.params = load_state_dict(model_dir, self.device)
        self.model.load_state_dict(self.params)
        # A single transfer is enough; the duplicate call was a no-op.
        self.model.to(self.device)
        self.model.eval()

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        pred_result = self.model(x)
        return pred_result
# Per-channel statistics used by `inference` to normalise an RGB image.
# Both are shaped (1, 1, 3) so they broadcast over an H x W x 3 array.
mean = np.array([[[124.55, 118.90, 102.94]]])
std = np.array([[[56.77, 55.97, 57.50]]])
def inference(model, device, input_path):
    """Run the model on one image file and return a thresholded mask.

    Args:
        model: Callable whose first output is indexed as ``out[0, 0]`` to get
            a 2-D map of logits.
        device: Device the input tensor is moved to before the forward pass.
        input_path: Path of the image to read.

    Returns:
        A float numpy array of shape ``(H, W, 1)`` holding rounded values of
        ``sigmoid(logit) * 255``, with everything below 20 set to 0.
    """
    # The context manager releases the file handle once the pixels are copied.
    with Image.open(input_path) as image:
        img = np.array(image.convert('RGB')).astype(np.float32)

    # Normalisation is applied at the source resolution, before the resize.
    img = (img - mean) / std
    img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR)
    tensor = torch.from_numpy(img).permute(2, 0, 1).to(device).float()

    # Without no_grad the output would carry autograd history and the
    # .numpy() call below would raise when the caller has gradients enabled.
    with torch.no_grad():
        outputs = model(tensor)

    out = outputs[0]
    pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy()
    pred[pred < 20] = 0  # suppress low-confidence pixels
    pred = np.round(pred[:, :, np.newaxis])
    logger.info('Inference Done')
    return pred
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | import torch.nn.functional as F | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | import torch.nn.functional as F | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import warnings | import warnings | ||||
| import torch | import torch | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import time | import time | ||||
| from typing import Dict, List, Optional, Tuple, Union | from typing import Dict, List, Optional, Tuple, Union | ||||
| @@ -1,3 +1,4 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from BasicSR, made public available under the Apache 2.0 License | |||||
| # at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/arch_util.py | |||||
| import collections.abc | import collections.abc | ||||
| import math | import math | ||||
| import warnings | import warnings | ||||
| @@ -1,3 +1,5 @@ | |||||
| # The implementation is adopted from BasicSR, made public available under the Apache 2.0 License | |||||
| # at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/rrdbnet_arch.py | |||||
| import torch | import torch | ||||
| from torch import nn as nn | from torch import nn as nn | ||||
| from torch.nn import functional as F | from torch.nn import functional as F | ||||
| @@ -1 +1,3 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .model import CLIPForMultiModalEmbedding | from .model import CLIPForMultiModalEmbedding | ||||
| @@ -1,3 +1,18 @@ | |||||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| import os | import os | ||||
| from collections import OrderedDict | from collections import OrderedDict | ||||
| from typing import Any, Dict, Iterable, List, Tuple, Union | from typing import Any, Dict, Iterable, List, Tuple, Union | ||||
| @@ -543,6 +543,7 @@ class GEMMModel(nn.Module): | |||||
| img_feature, text_feature, caption = None, None, None | img_feature, text_feature, caption = None, None, None | ||||
| if captioning and image is not None: | if captioning and image is not None: | ||||
| img_feature, caption = self.model.image_to_text(image) | img_feature, caption = self.model.image_to_text(image) | ||||
| img_feature = self.parse_feat(img_feature) | |||||
| elif image is not None: | elif image is not None: | ||||
| img_feature = self.parse_feat(self.model.encode_image(image)) | img_feature = self.parse_feat(self.model.encode_image(image)) | ||||
| if text is not None: | if text is not None: | ||||
| @@ -67,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel): | |||||
| return img_tensor | return img_tensor | ||||
| def parse_text(self, text_str): | def parse_text(self, text_str): | ||||
| if text_str is None: | |||||
| if text_str is None or len(text_str) == 0: | |||||
| return None | return None | ||||
| if isinstance(text_str, str): | if isinstance(text_str, str): | ||||
| text_ids_tensor = self.gemm_model.tokenize(text_str) | text_ids_tensor = self.gemm_model.tokenize(text_str) | ||||
| @@ -79,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel): | |||||
| return text_ids_tensor.view(1, -1) | return text_ids_tensor.view(1, -1) | ||||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | ||||
| image = self.parse_image(input.get('image', input.get('img', None))) | |||||
| text = self.parse_text(input.get('text', input.get('txt', None))) | |||||
| captioning = input.get('captioning', False) is True | |||||
| image_input = input.get('image', input.get('img', None)) | |||||
| text_input = input.get('text', input.get('txt', None)) | |||||
| captioning_input = input.get('captioning', None) | |||||
| image = self.parse_image(image_input) | |||||
| text = self.parse_text(text_input) | |||||
| captioning = captioning_input is True or text_input == '' | |||||
| out = self.gemm_model(image, text, captioning) | out = self.gemm_model(image, text, captioning) | ||||
| output = { | output = { | ||||
| OutputKeys.IMG_EMBEDDING: out.get('image_feature', None), | OutputKeys.IMG_EMBEDDING: out.get('image_feature', None), | ||||
| @@ -1,4 +1,4 @@ | |||||
| # The implementation is adopated from the CLIP4Clip implementation, | |||||
| # The implementation is adopted from the CLIP4Clip implementation, | |||||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | ||||
| import random | import random | ||||
| @@ -1,4 +1,4 @@ | |||||
| # The implementation is adopated from the CLIP4Clip implementation, | |||||
| # The implementation is adopted from the CLIP4Clip implementation, | |||||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | ||||
| import numpy as np | import numpy as np | ||||
| @@ -1,4 +1,4 @@ | |||||
| # The implementation is adopated from the CLIP4Clip implementation, | |||||
| # The implementation is adopted from the CLIP4Clip implementation, | |||||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | ||||
| import gzip | import gzip | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel | from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel | ||||
| from .tokenization_ofa import OFATokenizer, OFATokenizerZH | from .tokenization_ofa import OFATokenizer, OFATokenizerZH | ||||
| from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast | from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast | ||||
| @@ -1,3 +1,17 @@ | |||||
| # Copyright 2022 OFA-Sys Team. All rights reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| @@ -1 +1,2 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| from .constant import OFA_TASK_KEY_MAPPING | from .constant import OFA_TASK_KEY_MAPPING | ||||
| @@ -1,3 +1,5 @@ | |||||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||||
| import os | import os | ||||
| from typing import Any, Dict | from typing import Any, Dict | ||||
| @@ -21,6 +21,7 @@ class OutputKeys(object): | |||||
| POLYGONS = 'polygons' | POLYGONS = 'polygons' | ||||
| OUTPUT = 'output' | OUTPUT = 'output' | ||||
| OUTPUT_IMG = 'output_img' | OUTPUT_IMG = 'output_img' | ||||
| OUTPUT_VIDEO = 'output_video' | |||||
| OUTPUT_PCM = 'output_pcm' | OUTPUT_PCM = 'output_pcm' | ||||
| IMG_EMBEDDING = 'img_embedding' | IMG_EMBEDDING = 'img_embedding' | ||||
| SPO_LIST = 'spo_list' | SPO_LIST = 'spo_list' | ||||
| @@ -37,8 +38,10 @@ class OutputKeys(object): | |||||
| KWS_LIST = 'kws_list' | KWS_LIST = 'kws_list' | ||||
| HISTORY = 'history' | HISTORY = 'history' | ||||
| TIMESTAMPS = 'timestamps' | TIMESTAMPS = 'timestamps' | ||||
| SPLIT_VIDEO_NUM = 'split_video_num' | |||||
| SPLIT_META_LIST = 'split_meta_list' | |||||
| SHOT_NUM = 'shot_num' | |||||
| SCENE_NUM = 'scene_num' | |||||
| SCENE_META_LIST = 'scene_meta_list' | |||||
| SHOT_META_LIST = 'shot_meta_list' | |||||
| TASK_OUTPUTS = { | TASK_OUTPUTS = { | ||||
| @@ -218,13 +221,21 @@ TASK_OUTPUTS = { | |||||
| # 3D human body keypoints detection result for single sample | # 3D human body keypoints detection result for single sample | ||||
| # { | # { | ||||
| # "poses": [ | |||||
| # [[x, y, z]*17], | |||||
| # [[x, y, z]*17], | |||||
| # [[x, y, z]*17] | |||||
| # ] | |||||
| # "poses": [ # 3d pose coordinate in camera coordinate | |||||
| # [[x, y, z]*17], # joints of per image | |||||
| # [[x, y, z]*17], | |||||
| # ... | |||||
| # ], | |||||
| # "timestamps": [ # timestamps of all frames | |||||
| # "00:00:0.230", | |||||
| # "00:00:0.560", | |||||
| # "00:00:0.690", | |||||
| # ], | |||||
| # "output_video": "path_to_rendered_video" , this is optional | |||||
| # and is only available when the "render" option is enabled. | |||||
| # } | # } | ||||
| Tasks.body_3d_keypoints: [OutputKeys.POSES], | |||||
| Tasks.body_3d_keypoints: | |||||
| [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], | |||||
| # 2D hand keypoints result for single sample | # 2D hand keypoints result for single sample | ||||
| # { | # { | ||||
| @@ -300,19 +311,30 @@ TASK_OUTPUTS = { | |||||
| Tasks.shop_segmentation: [OutputKeys.MASKS], | Tasks.shop_segmentation: [OutputKeys.MASKS], | ||||
| # movie scene segmentation result for a single video | # movie scene segmentation result for a single video | ||||
| # { | # { | ||||
| # "split_video_num":3, | |||||
| # "split_meta_list": | |||||
| # "shot_num":15, | |||||
| # "shot_meta_list": | |||||
| # [ | |||||
| # { | |||||
| # "frame": [start_frame, end_frame], | |||||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # | |||||
| # } | |||||
| # ] | |||||
| # "scene_num":3, | |||||
| # "scene_meta_list": | |||||
| # [ | # [ | ||||
| # { | # { | ||||
| # "shot": [0,1,2], | # "shot": [0,1,2], | ||||
| # "frame": [start_frame, end_frame], | # "frame": [start_frame, end_frame], | ||||
| # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||||
| # } | # } | ||||
| # ] | # ] | ||||
| # | # | ||||
| # } | # } | ||||
| Tasks.movie_scene_segmentation: | |||||
| [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], | |||||
| Tasks.movie_scene_segmentation: [ | |||||
| OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM, | |||||
| OutputKeys.SCENE_META_LIST | |||||
| ], | |||||
| # ============ nlp tasks =================== | # ============ nlp tasks =================== | ||||
| @@ -649,8 +671,28 @@ TASK_OUTPUTS = { | |||||
| # 'output': ['Done' / 'Decode_Error'] | # 'output': ['Done' / 'Decode_Error'] | ||||
| # } | # } | ||||
| Tasks.video_inpainting: [OutputKeys.OUTPUT], | Tasks.video_inpainting: [OutputKeys.OUTPUT], | ||||
| # { | # { | ||||
| # 'output': ['bixin'] | # 'output': ['bixin'] | ||||
| # } | # } | ||||
| Tasks.hand_static: [OutputKeys.OUTPUT] | |||||
| Tasks.hand_static: [OutputKeys.OUTPUT], | |||||
| # { | |||||
| # 'output': [ | |||||
| # [2, 75, 287, 240, 510, 0.8335018754005432], | |||||
| # [1, 127, 83, 332, 366, 0.9175254702568054], | |||||
| # [0, 0, 0, 367, 639, 0.9693422317504883]] | |||||
| # } | |||||
| Tasks.face_human_hand_detection: [OutputKeys.OUTPUT], | |||||
| # { | |||||
| # {'output': 'Happiness', 'boxes': (203, 104, 663, 564)} | |||||
| # } | |||||
| Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES], | |||||
| # { | |||||
| # "masks": [ | |||||
| # np.array # 2D array containing only 0, 255 | |||||
| # ] | |||||
| # } | |||||
| Tasks.product_segmentation: [OutputKeys.MASKS], | |||||
| } | } | ||||