# Conflicts:
#	modelscope/models/multi_modal/ofa/utils/__init__.py
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e | |||
| size 123341 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487 | |||
| size 23889 | |||
| @@ -0,0 +1,3 @@ | |||
| version https://git-lfs.github.com/spec/v1 | |||
| oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec | |||
| size 108357 | |||
| @@ -40,6 +40,9 @@ class Models(object): | |||
| ulfd = 'ulfd' | |||
| video_inpainting = 'video-inpainting' | |||
| hand_static = 'hand-static' | |||
| face_human_hand_detection = 'face-human-hand-detection' | |||
| face_emotion = 'face-emotion' | |||
| product_segmentation = 'product-segmentation' | |||
| # EasyCV models | |||
| yolox = 'YOLOX' | |||
| @@ -179,9 +182,16 @@ class Pipelines(object): | |||
| movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' | |||
| shop_segmentation = 'shop-segmentation' | |||
| video_inpainting = 'video-inpainting' | |||
| pst_action_recognition = 'patchshift-action-recognition' | |||
| hand_static = 'hand-static' | |||
| face_human_hand_detection = 'face-human-hand-detection' | |||
| face_emotion = 'face-emotion' | |||
| product_segmentation = 'product-segmentation' | |||
| # nlp tasks | |||
| automatic_post_editing = 'automatic-post-editing' | |||
| translation_quality_estimation = 'translation-quality-estimation' | |||
| domain_classification = 'domain-classification' | |||
| sentence_similarity = 'sentence-similarity' | |||
| word_segmentation = 'word-segmentation' | |||
| part_of_speech = 'part-of-speech' | |||
| @@ -1,3 +1,5 @@ | |||
| # Part of the implementation is borrowed and modified from BasicSR, publicly available at | |||
| # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py | |||
| from typing import Dict | |||
| import numpy as np | |||
| @@ -7,11 +7,13 @@ if TYPE_CHECKING: | |||
| from .models import BaseVideoModel | |||
| from .tada_convnext import TadaConvNeXt | |||
| from .temporal_patch_shift_transformer import PatchShiftTransformer | |||
| else: | |||
| _import_structure = { | |||
| 'models': ['BaseVideoModel'], | |||
| 'tada_convnext': ['TadaConvNeXt'], | |||
| 'temporal_patch_shift_transformer': ['PatchShiftTransformer'] | |||
| } | |||
| import sys | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation. | |||
| import os | |||
| import numpy as np | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation. | |||
| cfg_128x128_15 = { | |||
| 'DATASET': { | |||
| 'TYPE': 'DAMO', | |||
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import logging | |||
| import os.path as osp | |||
| from typing import Any, Dict, List, Union | |||
| @@ -1,4 +1,4 @@ | |||
| # The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D | |||
| # The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||
| import numpy as np | |||
| from modelscope.models.cv.cartoon.facelib.config import config as cfg | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||
| import os | |||
| import numpy as np | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||
| import time | |||
| import cv2 | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||
| import cv2 | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine | |||
| import time | |||
| import cv2 | |||
| @@ -1,7 +1,5 @@ | |||
| """ | |||
| Created on Mon Apr 24 15:43:29 2017 | |||
| @author: zhaoy | |||
| """ | |||
| # The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch | |||
| import cv2 | |||
| import numpy as np | |||
| @@ -1,8 +1,4 @@ | |||
| """ | |||
| Created on Tue Jul 11 06:54:28 2017 | |||
| @author: zhaoyafei | |||
| """ | |||
| # The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch | |||
| import numpy as np | |||
| from numpy.linalg import inv, lstsq | |||
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import cv2 | |||
| @@ -1 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .models.detectors import MogFaceDetector | |||
| @@ -1 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .models.detector import MtcnnFaceDetector | |||
| @@ -1 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .detection import RetinaFaceDetection | |||
| @@ -1 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .detection import UlfdFaceDetector | |||
| @@ -0,0 +1,20 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .emotion_model import EfficientNetForFaceEmotion | |||
| else: | |||
| _import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
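A short sketch of what the lazy registration above gives callers (an assumption based on how LazyImportModule is used throughout modelscope: the submodule import is deferred until first attribute access):

# cheap: emotion_model (and hence torch) is not imported yet
from modelscope.models.cv import face_emotion
# first attribute access resolves 'emotion_model' via _import_structure
cls = face_emotion.EfficientNetForFaceEmotion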
| @@ -0,0 +1,6 @@ | |||
| # The implementation here is modified based on EfficientNet, | |||
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
| from .model import VALID_MODELS, EfficientNet | |||
| from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet, | |||
| get_model_params) | |||
| @@ -0,0 +1,380 @@ | |||
| # The implementation here is modified based on EfficientNet, | |||
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
| import torch | |||
| from torch import nn | |||
| from torch.nn import functional as F | |||
| from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size, | |||
| drop_connect, efficientnet_params, get_model_params, | |||
| get_same_padding_conv2d, load_pretrained_weights, | |||
| round_filters, round_repeats) | |||
| VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', | |||
| 'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5', | |||
| 'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8', | |||
| 'efficientnet-l2') | |||
| class MBConvBlock(nn.Module): | |||
| def __init__(self, block_args, global_params, image_size=None): | |||
| super().__init__() | |||
| self._block_args = block_args | |||
| self._bn_mom = 1 - global_params.batch_norm_momentum | |||
| self._bn_eps = global_params.batch_norm_epsilon | |||
| self.has_se = (self._block_args.se_ratio | |||
| is not None) and (0 < self._block_args.se_ratio <= 1) | |||
| self.id_skip = block_args.id_skip | |||
| inp = self._block_args.input_filters | |||
| oup = self._block_args.input_filters * self._block_args.expand_ratio | |||
| if self._block_args.expand_ratio != 1: | |||
| Conv2d = get_same_padding_conv2d(image_size=image_size) | |||
| self._expand_conv = Conv2d( | |||
| in_channels=inp, out_channels=oup, kernel_size=1, bias=False) | |||
| self._bn0 = nn.BatchNorm2d( | |||
| num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) | |||
| k = self._block_args.kernel_size | |||
| s = self._block_args.stride | |||
| Conv2d = get_same_padding_conv2d(image_size=image_size) | |||
| self._depthwise_conv = Conv2d( | |||
| in_channels=oup, | |||
| out_channels=oup, | |||
| groups=oup, | |||
| kernel_size=k, | |||
| stride=s, | |||
| bias=False) | |||
| self._bn1 = nn.BatchNorm2d( | |||
| num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) | |||
| image_size = calculate_output_image_size(image_size, s) | |||
| if self.has_se: | |||
| Conv2d = get_same_padding_conv2d(image_size=(1, 1)) | |||
| num_squeezed_channels = max( | |||
| 1, | |||
| int(self._block_args.input_filters | |||
| * self._block_args.se_ratio)) | |||
| self._se_reduce = Conv2d( | |||
| in_channels=oup, | |||
| out_channels=num_squeezed_channels, | |||
| kernel_size=1) | |||
| self._se_expand = Conv2d( | |||
| in_channels=num_squeezed_channels, | |||
| out_channels=oup, | |||
| kernel_size=1) | |||
| final_oup = self._block_args.output_filters | |||
| Conv2d = get_same_padding_conv2d(image_size=image_size) | |||
| self._project_conv = Conv2d( | |||
| in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) | |||
| self._bn2 = nn.BatchNorm2d( | |||
| num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) | |||
| self._swish = MemoryEfficientSwish() | |||
| def forward(self, inputs, drop_connect_rate=None): | |||
| """MBConvBlock's forward function. | |||
| Args: | |||
| inputs (tensor): Input tensor. | |||
drop_connect_rate (float): Drop connect rate, between 0 and 1.
| Returns: | |||
| Output of this block after processing. | |||
| """ | |||
| x = inputs | |||
| if self._block_args.expand_ratio != 1: | |||
| x = self._expand_conv(inputs) | |||
| x = self._bn0(x) | |||
| x = self._swish(x) | |||
| x = self._depthwise_conv(x) | |||
| x = self._bn1(x) | |||
| x = self._swish(x) | |||
| if self.has_se: | |||
| x_squeezed = F.adaptive_avg_pool2d(x, 1) | |||
| x_squeezed = self._se_reduce(x_squeezed) | |||
| x_squeezed = self._swish(x_squeezed) | |||
| x_squeezed = self._se_expand(x_squeezed) | |||
| x = torch.sigmoid(x_squeezed) * x | |||
| x = self._project_conv(x) | |||
| x = self._bn2(x) | |||
| input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters | |||
| if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: | |||
| if drop_connect_rate: | |||
| x = drop_connect( | |||
| x, p=drop_connect_rate, training=self.training) | |||
| x = x + inputs | |||
| return x | |||
| def set_swish(self, memory_efficient=True): | |||
| """Sets swish function as memory efficient (for training) or standard (for export). | |||
| Args: | |||
| memory_efficient (bool): Whether to use memory-efficient version of swish. | |||
| """ | |||
| self._swish = MemoryEfficientSwish() if memory_efficient else Swish() | |||
| class EfficientNet(nn.Module): | |||
| """EfficientNet model. | |||
| Most easily loaded with the .from_name or .from_pretrained methods. | |||
| Args: | |||
| blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks. | |||
| global_params (namedtuple): A set of GlobalParams shared between blocks. | |||
| References: | |||
| [1] https://arxiv.org/abs/1905.11946 (EfficientNet) | |||
| Example: | |||
| >>> import torch | |||
| >>> from efficientnet.model import EfficientNet | |||
| >>> inputs = torch.rand(1, 3, 224, 224) | |||
| >>> model = EfficientNet.from_pretrained('efficientnet-b0') | |||
| >>> model.eval() | |||
| >>> outputs = model(inputs) | |||
| """ | |||
| def __init__(self, blocks_args=None, global_params=None): | |||
| super().__init__() | |||
| assert isinstance(blocks_args, list), 'blocks_args should be a list' | |||
assert len(blocks_args) > 0, 'blocks_args must not be empty'
| self._global_params = global_params | |||
| self._blocks_args = blocks_args | |||
| bn_mom = 1 - self._global_params.batch_norm_momentum | |||
| bn_eps = self._global_params.batch_norm_epsilon | |||
| image_size = global_params.image_size | |||
| Conv2d = get_same_padding_conv2d(image_size=image_size) | |||
| in_channels = 3 | |||
| out_channels = round_filters(32, self._global_params) | |||
| self._conv_stem = Conv2d( | |||
| in_channels, out_channels, kernel_size=3, stride=2, bias=False) | |||
| self._bn0 = nn.BatchNorm2d( | |||
| num_features=out_channels, momentum=bn_mom, eps=bn_eps) | |||
| image_size = calculate_output_image_size(image_size, 2) | |||
| self._blocks = nn.ModuleList([]) | |||
| for block_args in self._blocks_args: | |||
| block_args = block_args._replace( | |||
| input_filters=round_filters(block_args.input_filters, | |||
| self._global_params), | |||
| output_filters=round_filters(block_args.output_filters, | |||
| self._global_params), | |||
| num_repeat=round_repeats(block_args.num_repeat, | |||
| self._global_params)) | |||
| self._blocks.append( | |||
| MBConvBlock( | |||
| block_args, self._global_params, image_size=image_size)) | |||
| image_size = calculate_output_image_size(image_size, | |||
| block_args.stride) | |||
| if block_args.num_repeat > 1: | |||
| block_args = block_args._replace( | |||
| input_filters=block_args.output_filters, stride=1) | |||
| for _ in range(block_args.num_repeat - 1): | |||
| self._blocks.append( | |||
| MBConvBlock( | |||
| block_args, self._global_params, | |||
| image_size=image_size)) | |||
| in_channels = block_args.output_filters | |||
| out_channels = round_filters(1280, self._global_params) | |||
| Conv2d = get_same_padding_conv2d(image_size=image_size) | |||
| self._conv_head = Conv2d( | |||
| in_channels, out_channels, kernel_size=1, bias=False) | |||
| self._bn1 = nn.BatchNorm2d( | |||
| num_features=out_channels, momentum=bn_mom, eps=bn_eps) | |||
| self._avg_pooling = nn.AdaptiveAvgPool2d(1) | |||
| if self._global_params.include_top: | |||
| self._dropout = nn.Dropout(self._global_params.dropout_rate) | |||
| self._fc = nn.Linear(out_channels, self._global_params.num_classes) | |||
| self._swish = MemoryEfficientSwish() | |||
| def set_swish(self, memory_efficient=True): | |||
| """Sets swish function as memory efficient (for training) or standard (for export). | |||
| Args: | |||
| memory_efficient (bool): Whether to use memory-efficient version of swish. | |||
| """ | |||
| self._swish = MemoryEfficientSwish() if memory_efficient else Swish() | |||
| for block in self._blocks: | |||
| block.set_swish(memory_efficient) | |||
| def extract_endpoints(self, inputs): | |||
| """Use convolution layer to extract features | |||
| from reduction levels i in [1, 2, 3, 4, 5]. | |||
| Args: | |||
| inputs (tensor): Input tensor. | |||
| Returns: | |||
| Dictionary of last intermediate features | |||
| with reduction levels i in [1, 2, 3, 4, 5]. | |||
| Example: | |||
| >>> import torch | |||
| >>> from efficientnet.model import EfficientNet | |||
| >>> inputs = torch.rand(1, 3, 224, 224) | |||
| >>> model = EfficientNet.from_pretrained('efficientnet-b0') | |||
| >>> endpoints = model.extract_endpoints(inputs) | |||
| >>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112]) | |||
| >>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56]) | |||
| >>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28]) | |||
| >>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14]) | |||
| >>> print(endpoints['reduction_5'].shape) # torch.Size([1, 320, 7, 7]) | |||
| >>> print(endpoints['reduction_6'].shape) # torch.Size([1, 1280, 7, 7]) | |||
| """ | |||
| endpoints = dict() | |||
| x = self._swish(self._bn0(self._conv_stem(inputs))) | |||
| prev_x = x | |||
| for idx, block in enumerate(self._blocks): | |||
| drop_connect_rate = self._global_params.drop_connect_rate | |||
| if drop_connect_rate: | |||
drop_connect_rate *= float(idx) / len(
self._blocks)  # scale drop_connect_rate linearly with block depth
| x = block(x, drop_connect_rate=drop_connect_rate) | |||
| if prev_x.size(2) > x.size(2): | |||
| endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x | |||
| elif idx == len(self._blocks) - 1: | |||
| endpoints['reduction_{}'.format(len(endpoints) + 1)] = x | |||
| prev_x = x | |||
| x = self._swish(self._bn1(self._conv_head(x))) | |||
| endpoints['reduction_{}'.format(len(endpoints) + 1)] = x | |||
| return endpoints | |||
| def extract_features(self, inputs): | |||
| """use convolution layer to extract feature . | |||
| Args: | |||
| inputs (tensor): Input tensor. | |||
| Returns: | |||
| Output of the final convolution | |||
| layer in the efficientnet model. | |||
| """ | |||
| x = self._swish(self._bn0(self._conv_stem(inputs))) | |||
| for idx, block in enumerate(self._blocks): | |||
| drop_connect_rate = self._global_params.drop_connect_rate | |||
| if drop_connect_rate: | |||
| drop_connect_rate *= float(idx) / len(self._blocks) | |||
| x = block(x, drop_connect_rate=drop_connect_rate) | |||
| x = self._swish(self._bn1(self._conv_head(x))) | |||
| return x | |||
| def forward(self, inputs): | |||
| """EfficientNet's forward function. | |||
| Calls extract_features to extract features, applies final linear layer, and returns logits. | |||
| Args: | |||
| inputs (tensor): Input tensor. | |||
| Returns: | |||
| Output of this model after processing. | |||
| """ | |||
| x = self.extract_features(inputs) | |||
| x = self._avg_pooling(x) | |||
| if self._global_params.include_top: | |||
| x = x.flatten(start_dim=1) | |||
| x = self._dropout(x) | |||
| x = self._fc(x) | |||
| return x | |||
| @classmethod | |||
| def from_name(cls, model_name, in_channels=3, **override_params): | |||
| """Create an efficientnet model according to name. | |||
| Args: | |||
| model_name (str): Name for efficientnet. | |||
| in_channels (int): Input data's channel number. | |||
| override_params (other key word params): | |||
| Params to override model's global_params. | |||
| Optional key: | |||
| 'width_coefficient', 'depth_coefficient', | |||
| 'image_size', 'dropout_rate', | |||
| 'num_classes', 'batch_norm_momentum', | |||
| 'batch_norm_epsilon', 'drop_connect_rate', | |||
| 'depth_divisor', 'min_depth' | |||
| Returns: | |||
| An efficientnet model. | |||
| """ | |||
| cls._check_model_name_is_valid(model_name) | |||
| blocks_args, global_params = get_model_params(model_name, | |||
| override_params) | |||
| model = cls(blocks_args, global_params) | |||
| model._change_in_channels(in_channels) | |||
| return model | |||
| @classmethod | |||
| def from_pretrained(cls, | |||
| model_name, | |||
| weights_path=None, | |||
| advprop=False, | |||
| in_channels=3, | |||
| num_classes=1000, | |||
| **override_params): | |||
| """Create an efficientnet model according to name. | |||
| Args: | |||
| model_name (str): Name for efficientnet. | |||
| weights_path (None or str): | |||
| str: path to pretrained weights file on the local disk. | |||
| None: use pretrained weights downloaded from the Internet. | |||
| advprop (bool): | |||
| Whether to load pretrained weights | |||
| trained with advprop (valid when weights_path is None). | |||
| in_channels (int): Input data's channel number. | |||
| num_classes (int): | |||
| Number of categories for classification. | |||
| It controls the output size for final linear layer. | |||
| override_params (other key word params): | |||
| Params to override model's global_params. | |||
| Optional key: | |||
| 'width_coefficient', 'depth_coefficient', | |||
| 'image_size', 'dropout_rate', | |||
| 'batch_norm_momentum', | |||
| 'batch_norm_epsilon', 'drop_connect_rate', | |||
| 'depth_divisor', 'min_depth' | |||
| Returns: | |||
| A pretrained efficientnet model. | |||
| """ | |||
| model = cls.from_name( | |||
| model_name, num_classes=num_classes, **override_params) | |||
| model._change_in_channels(in_channels) | |||
| return model | |||
| @classmethod | |||
| def get_image_size(cls, model_name): | |||
| """Get the input image size for a given efficientnet model. | |||
| Args: | |||
| model_name (str): Name for efficientnet. | |||
| Returns: | |||
| Input image size (resolution). | |||
| """ | |||
| cls._check_model_name_is_valid(model_name) | |||
| _, _, res, _ = efficientnet_params(model_name) | |||
| return res | |||
| @classmethod | |||
| def _check_model_name_is_valid(cls, model_name): | |||
| """Validates model name. | |||
| Args: | |||
| model_name (str): Name for efficientnet. | |||
| Returns: | |||
| bool: Is a valid name or not. | |||
| """ | |||
| if model_name not in VALID_MODELS: | |||
| raise ValueError('model_name should be one of: ' | |||
| + ', '.join(VALID_MODELS)) | |||
| def _change_in_channels(self, in_channels): | |||
| """Adjust model's first convolution layer to in_channels, if in_channels not equals 3. | |||
| Args: | |||
| in_channels (int): Input data's channel number. | |||
| """ | |||
| if in_channels != 3: | |||
| Conv2d = get_same_padding_conv2d( | |||
| image_size=self._global_params.image_size) | |||
| out_channels = round_filters(32, self._global_params) | |||
| self._conv_stem = Conv2d( | |||
| in_channels, out_channels, kernel_size=3, stride=2, bias=False) | |||
| @@ -0,0 +1,559 @@ | |||
| # The implementation here is modified based on EfficientNet, | |||
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch
| import collections | |||
| import math | |||
| import re | |||
| from functools import partial | |||
| import torch | |||
| from torch import nn | |||
| from torch.nn import functional as F | |||
| from torch.utils import model_zoo | |||
| GlobalParams = collections.namedtuple('GlobalParams', [ | |||
| 'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate', | |||
| 'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon', | |||
| 'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top' | |||
| ]) | |||
| BlockArgs = collections.namedtuple('BlockArgs', [ | |||
| 'num_repeat', 'kernel_size', 'stride', 'expand_ratio', 'input_filters', | |||
| 'output_filters', 'se_ratio', 'id_skip' | |||
| ]) | |||
| GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) | |||
| BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) | |||
| if hasattr(nn, 'SiLU'): | |||
| Swish = nn.SiLU | |||
| else: | |||
| class Swish(nn.Module): | |||
| def forward(self, x): | |||
| return x * torch.sigmoid(x) | |||
| class SwishImplementation(torch.autograd.Function): | |||
| @staticmethod | |||
| def forward(ctx, i): | |||
| result = i * torch.sigmoid(i) | |||
| ctx.save_for_backward(i) | |||
| return result | |||
| @staticmethod | |||
| def backward(ctx, grad_output): | |||
| i = ctx.saved_tensors[0] | |||
| sigmoid_i = torch.sigmoid(i) | |||
| return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) | |||
| class MemoryEfficientSwish(nn.Module): | |||
| def forward(self, x): | |||
| return SwishImplementation.apply(x) | |||
| def round_filters(filters, global_params): | |||
| """Calculate and round number of filters based on width multiplier. | |||
| Use width_coefficient, depth_divisor and min_depth of global_params. | |||
| Args: | |||
| filters (int): Filters number to be calculated. | |||
| global_params (namedtuple): Global params of the model. | |||
| Returns: | |||
| new_filters: New filters number after calculating. | |||
| """ | |||
| multiplier = global_params.width_coefficient | |||
| if not multiplier: | |||
| return filters | |||
| divisor = global_params.depth_divisor | |||
| min_depth = global_params.min_depth | |||
| filters *= multiplier | |||
| min_depth = min_depth or divisor | |||
| new_filters = max(min_depth, | |||
| int(filters + divisor / 2) // divisor * divisor) | |||
| if new_filters < 0.9 * filters: | |||
| new_filters += divisor | |||
| return int(new_filters) | |||
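# Worked example (illustrative): with width_coefficient=1.1 and
# depth_divisor=8, round_filters(32, gp) gives 32 * 1.1 = 35.2, rounded to
# the nearest multiple of 8: int(35.2 + 4) // 8 * 8 = 32; since
# 32 >= 0.9 * 35.2 nothing is added back, so the result stays 32.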
| def round_repeats(repeats, global_params): | |||
| """Calculate module's repeat number of a block based on depth multiplier. | |||
| Use depth_coefficient of global_params. | |||
| Args: | |||
| repeats (int): num_repeat to be calculated. | |||
| global_params (namedtuple): Global params of the model. | |||
| Returns: | |||
| new repeat: New repeat number after calculating. | |||
| """ | |||
| multiplier = global_params.depth_coefficient | |||
| if not multiplier: | |||
| return repeats | |||
| return int(math.ceil(multiplier * repeats)) | |||
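# Example (illustrative): with depth_coefficient=1.2, a stage defined with
# num_repeat=2 becomes ceil(1.2 * 2) = 3 blocks; depth scaling always rounds up.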
| def drop_connect(inputs, p, training): | |||
| """Drop connect. | |||
| Args: | |||
inputs (tensor: BCWH): Input of this structure.
| p (float: 0.0~1.0): Probability of drop connection. | |||
| training (bool): The running mode. | |||
| Returns: | |||
| output: Output after drop connection. | |||
| """ | |||
| assert 0 <= p <= 1, 'p must be in range of [0,1]' | |||
| if not training: | |||
| return inputs | |||
| batch_size = inputs.shape[0] | |||
| keep_prob = 1 - p | |||
| random_tensor = keep_prob | |||
| random_tensor += torch.rand([batch_size, 1, 1, 1], | |||
| dtype=inputs.dtype, | |||
| device=inputs.device) | |||
| binary_tensor = torch.floor(random_tensor) | |||
| output = inputs / keep_prob * binary_tensor | |||
| return output | |||
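# Example (illustrative sketch): each sample survives with probability
# 1 - p and survivors are rescaled by 1 / (1 - p), so the expected value
# is unchanged.
# >>> x = torch.ones(4, 2, 3, 3)
# >>> y = drop_connect(x, p=0.5, training=True)
# >>> # each of the 4 samples in y is either all zeros or all 2.0
# >>> torch.equal(drop_connect(x, p=0.5, training=False), x)
# True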
| def get_width_and_height_from_size(x): | |||
| """Obtain height and width from x. | |||
| Args: | |||
| x (int, tuple or list): Data size. | |||
| Returns: | |||
| size: A tuple or list (H,W). | |||
| """ | |||
| if isinstance(x, int): | |||
| return x, x | |||
| if isinstance(x, list) or isinstance(x, tuple): | |||
| return x | |||
| else: | |||
raise TypeError('size must be an int, list or tuple, got {}'.format(type(x)))
| def calculate_output_image_size(input_image_size, stride): | |||
| """Calculates the output image size when using Conv2dSamePadding with a stride. | |||
| Necessary for static padding. Thanks to mannatsingh for pointing this out. | |||
| Args: | |||
| input_image_size (int, tuple or list): Size of input image. | |||
| stride (int, tuple or list): Conv2d operation's stride. | |||
| Returns: | |||
| output_image_size: A list [H,W]. | |||
| """ | |||
| if input_image_size is None: | |||
| return None | |||
| image_height, image_width = get_width_and_height_from_size( | |||
| input_image_size) | |||
| stride = stride if isinstance(stride, int) else stride[0] | |||
| image_height = int(math.ceil(image_height / stride)) | |||
| image_width = int(math.ceil(image_width / stride)) | |||
| return [image_height, image_width] | |||
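# Example: a 224x224 input through a stride-2 layer gives
# calculate_output_image_size(224, 2) == [112, 112]; None propagates
# unchanged, which is the dynamic-image-size mode.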
| def get_same_padding_conv2d(image_size=None): | |||
| """Chooses static padding if you have specified an image size, and dynamic padding otherwise. | |||
| Static padding is necessary for ONNX exporting of models. | |||
| Args: | |||
| image_size (int or tuple): Size of the image. | |||
| Returns: | |||
| Conv2dDynamicSamePadding or Conv2dStaticSamePadding. | |||
| """ | |||
| if image_size is None: | |||
| return Conv2dDynamicSamePadding | |||
| else: | |||
| return partial(Conv2dStaticSamePadding, image_size=image_size) | |||
| class Conv2dDynamicSamePadding(nn.Conv2d): | |||
| """2D Convolutions like TensorFlow, for a dynamic image size. | |||
The padding is computed dynamically in the forward pass.
| """ | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| dilation=1, | |||
| groups=1, | |||
| bias=True): | |||
| super().__init__(in_channels, out_channels, kernel_size, stride, 0, | |||
| dilation, groups, bias) | |||
| self.stride = self.stride if len( | |||
| self.stride) == 2 else [self.stride[0]] * 2 | |||
| def forward(self, x): | |||
| ih, iw = x.size()[-2:] | |||
| kh, kw = self.weight.size()[-2:] | |||
| sh, sw = self.stride | |||
| oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) | |||
| a1 = (oh - 1) * self.stride[0] | |||
| pad_h = max(a1 + (kh - 1) * self.dilation[0] + 1 - ih, 0) | |||
| a2 = (ow - 1) * self.stride[1] | |||
| pad_w = max(a2 + (kw - 1) * self.dilation[1] + 1 - iw, 0) | |||
| if pad_h > 0 or pad_w > 0: | |||
| x = F.pad(x, [ | |||
| pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 | |||
| ]) | |||
| return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, | |||
| self.dilation, self.groups) | |||
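# Worked padding example (illustrative): for ih = 7, kernel kh = 3,
# stride sh = 2 and dilation 1, oh = ceil(7 / 2) = 4 and
# pad_h = max((4 - 1) * 2 + (3 - 1) + 1 - 7, 0) = 2, split as one row on
# top and one on the bottom, matching TensorFlow's 'SAME' padding.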
| class Conv2dStaticSamePadding(nn.Conv2d): | |||
| """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size. | |||
The padding module is built in the constructor, then applied in forward.
| """ | |||
| def __init__(self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| image_size=None, | |||
| **kwargs): | |||
| super().__init__(in_channels, out_channels, kernel_size, stride, | |||
| **kwargs) | |||
| self.stride = self.stride if len( | |||
| self.stride) == 2 else [self.stride[0]] * 2 | |||
| assert image_size is not None | |||
| ih, iw = (image_size, | |||
| image_size) if isinstance(image_size, int) else image_size | |||
| kh, kw = self.weight.size()[-2:] | |||
| sh, sw = self.stride | |||
| oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) | |||
| b1 = (oh - 1) * self.stride[0] | |||
| pad_h = max(b1 + (kh - 1) * self.dilation[0] + 1 - ih, 0) | |||
| b2 = (ow - 1) * self.stride[1] | |||
| pad_w = max(b2 + (kw - 1) * self.dilation[1] + 1 - iw, 0) | |||
| if pad_h > 0 or pad_w > 0: | |||
| self.static_padding = nn.ZeroPad2d( | |||
| (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, | |||
| pad_h - pad_h // 2)) | |||
| else: | |||
| self.static_padding = nn.Identity() | |||
| def forward(self, x): | |||
| x = self.static_padding(x) | |||
| x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, | |||
| self.dilation, self.groups) | |||
| return x | |||
| def get_same_padding_maxPool2d(image_size=None): | |||
| """Chooses static padding if you have specified an image size, and dynamic padding otherwise. | |||
| Static padding is necessary for ONNX exporting of models. | |||
| Args: | |||
| image_size (int or tuple): Size of the image. | |||
| Returns: | |||
| MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding. | |||
| """ | |||
| if image_size is None: | |||
| return MaxPool2dDynamicSamePadding | |||
| else: | |||
| return partial(MaxPool2dStaticSamePadding, image_size=image_size) | |||
| class MaxPool2dDynamicSamePadding(nn.MaxPool2d): | |||
| """2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size. | |||
The padding is computed dynamically in the forward pass.
| """ | |||
| def __init__(self, | |||
| kernel_size, | |||
| stride, | |||
| padding=0, | |||
| dilation=1, | |||
| return_indices=False, | |||
| ceil_mode=False): | |||
| super().__init__(kernel_size, stride, padding, dilation, | |||
| return_indices, ceil_mode) | |||
| self.stride = [self.stride] * 2 if isinstance(self.stride, | |||
| int) else self.stride | |||
| self.kernel_size = [self.kernel_size] * 2 if isinstance( | |||
| self.kernel_size, int) else self.kernel_size | |||
| self.dilation = [self.dilation] * 2 if isinstance( | |||
| self.dilation, int) else self.dilation | |||
| def forward(self, x): | |||
| ih, iw = x.size()[-2:] | |||
| kh, kw = self.kernel_size | |||
| sh, sw = self.stride | |||
| oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) | |||
| c1 = (oh - 1) * self.stride[0] | |||
| pad_h = max(c1 + (kh - 1) * self.dilation[0] + 1 - ih, 0) | |||
| c2 = (ow - 1) * self.stride[1] | |||
| pad_w = max(c2 + (kw - 1) * self.dilation[1] + 1 - iw, 0) | |||
| if pad_h > 0 or pad_w > 0: | |||
| x = F.pad(x, [ | |||
| pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 | |||
| ]) | |||
| return F.max_pool2d(x, self.kernel_size, self.stride, self.padding, | |||
| self.dilation, self.ceil_mode, self.return_indices) | |||
| class MaxPool2dStaticSamePadding(nn.MaxPool2d): | |||
| """2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size. | |||
The padding module is built in the constructor, then applied in forward.
| """ | |||
| def __init__(self, kernel_size, stride, image_size=None, **kwargs): | |||
| super().__init__(kernel_size, stride, **kwargs) | |||
| self.stride = [self.stride] * 2 if isinstance(self.stride, | |||
| int) else self.stride | |||
| self.kernel_size = [self.kernel_size] * 2 if isinstance( | |||
| self.kernel_size, int) else self.kernel_size | |||
| self.dilation = [self.dilation] * 2 if isinstance( | |||
| self.dilation, int) else self.dilation | |||
| assert image_size is not None | |||
| ih, iw = (image_size, | |||
| image_size) if isinstance(image_size, int) else image_size | |||
| kh, kw = self.kernel_size | |||
| sh, sw = self.stride | |||
| oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) | |||
| d1 = (oh - 1) * self.stride[0] | |||
| pad_h = max(d1 + (kh - 1) * self.dilation[0] + 1 - ih, 0) | |||
| d2 = (ow - 1) * self.stride[1] | |||
| pad_w = max(d2 + (kw - 1) * self.dilation[1] + 1 - iw, 0) | |||
| if pad_h > 0 or pad_w > 0: | |||
| self.static_padding = nn.ZeroPad2d( | |||
| (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, | |||
| pad_h - pad_h // 2)) | |||
| else: | |||
| self.static_padding = nn.Identity() | |||
| def forward(self, x): | |||
| x = self.static_padding(x) | |||
| x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding, | |||
| self.dilation, self.ceil_mode, self.return_indices) | |||
| return x | |||
| class BlockDecoder(object): | |||
| """Block Decoder for readability, | |||
| straight from the official TensorFlow repository. | |||
| """ | |||
| @staticmethod | |||
| def _decode_block_string(block_string): | |||
| """Get a block through a string notation of arguments. | |||
| Args: | |||
| block_string (str): A string notation of arguments. | |||
| Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'. | |||
| Returns: | |||
| BlockArgs: The namedtuple defined at the top of this file. | |||
| """ | |||
| assert isinstance(block_string, str) | |||
| ops = block_string.split('_') | |||
| options = {} | |||
| for op in ops: | |||
| splits = re.split(r'(\d.*)', op) | |||
| if len(splits) >= 2: | |||
| key, value = splits[:2] | |||
| options[key] = value | |||
| # Check stride | |||
| assert (('s' in options and len(options['s']) == 1) | |||
| or (len(options['s']) == 2 | |||
| and options['s'][0] == options['s'][1])) | |||
| return BlockArgs( | |||
| num_repeat=int(options['r']), | |||
| kernel_size=int(options['k']), | |||
| stride=[int(options['s'][0])], | |||
| expand_ratio=int(options['e']), | |||
| input_filters=int(options['i']), | |||
| output_filters=int(options['o']), | |||
| se_ratio=float(options['se']) if 'se' in options else None, | |||
| id_skip=('noskip' not in block_string)) | |||
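# Example (illustrative), decoding the b0 stem stage string:
# >>> BlockDecoder._decode_block_string('r1_k3_s11_e1_i32_o16_se0.25')
# BlockArgs(num_repeat=1, kernel_size=3, stride=[1], expand_ratio=1,
#           input_filters=32, output_filters=16, se_ratio=0.25, id_skip=True)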
| @staticmethod | |||
| def _encode_block_string(block): | |||
| """Encode a block to a string. | |||
| Args: | |||
| block (namedtuple): A BlockArgs type argument. | |||
| Returns: | |||
| block_string: A String form of BlockArgs. | |||
| """ | |||
| args = [ | |||
| 'r%d' % block.num_repeat, | |||
| 'k%d' % block.kernel_size, | |||
's%d%d' % (block.stride[0], block.stride[-1]),
| 'e%s' % block.expand_ratio, | |||
| 'i%d' % block.input_filters, | |||
| 'o%d' % block.output_filters | |||
| ] | |||
if block.se_ratio is not None and 0 < block.se_ratio <= 1:
| args.append('se%s' % block.se_ratio) | |||
| if block.id_skip is False: | |||
| args.append('noskip') | |||
| return '_'.join(args) | |||
| @staticmethod | |||
| def decode(string_list): | |||
| """Decode a list of string notations to specify blocks inside the network. | |||
| Args: | |||
| string_list (list[str]): A list of strings, each string is a notation of block. | |||
| Returns: | |||
| blocks_args: A list of BlockArgs namedtuples of block args. | |||
| """ | |||
| assert isinstance(string_list, list) | |||
| blocks_args = [] | |||
| for block_string in string_list: | |||
| blocks_args.append(BlockDecoder._decode_block_string(block_string)) | |||
| return blocks_args | |||
| @staticmethod | |||
| def encode(blocks_args): | |||
| """Encode a list of BlockArgs to a list of strings. | |||
| Args: | |||
| blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args. | |||
| Returns: | |||
| block_strings: A list of strings, each string is a notation of block. | |||
| """ | |||
| block_strings = [] | |||
| for block in blocks_args: | |||
| block_strings.append(BlockDecoder._encode_block_string(block)) | |||
| return block_strings | |||
| def efficientnet_params(model_name): | |||
| """Map EfficientNet model name to parameter coefficients. | |||
| Args: | |||
| model_name (str): Model name to be queried. | |||
| Returns: | |||
| params_dict[model_name]: A (width,depth,res,dropout) tuple. | |||
| """ | |||
| params_dict = { | |||
| 'efficientnet-b0': (1.0, 1.0, 112, 0.2), | |||
| 'efficientnet-b1': (1.0, 1.1, 240, 0.2), | |||
| 'efficientnet-b2': (1.1, 1.2, 260, 0.3), | |||
| 'efficientnet-b3': (1.2, 1.4, 300, 0.3), | |||
| 'efficientnet-b4': (1.4, 1.8, 380, 0.4), | |||
| 'efficientnet-b5': (1.6, 2.2, 456, 0.4), | |||
| 'efficientnet-b6': (1.8, 2.6, 528, 0.5), | |||
| 'efficientnet-b7': (2.0, 3.1, 600, 0.5), | |||
| 'efficientnet-b8': (2.2, 3.6, 672, 0.5), | |||
| 'efficientnet-l2': (4.3, 5.3, 800, 0.5), | |||
| } | |||
| return params_dict[model_name] | |||
| def efficientnet(width_coefficient=None, | |||
| depth_coefficient=None, | |||
| image_size=None, | |||
| dropout_rate=0.2, | |||
| drop_connect_rate=0.2, | |||
| num_classes=1000, | |||
| include_top=True): | |||
| """Create BlockArgs and GlobalParams for efficientnet model. | |||
| Args: | |||
| width_coefficient (float) | |||
| depth_coefficient (float) | |||
| image_size (int) | |||
| dropout_rate (float) | |||
| drop_connect_rate (float) | |||
| num_classes (int) | |||
| Meaning as the name suggests. | |||
| Returns: | |||
| blocks_args, global_params. | |||
| """ | |||
| blocks_args = [ | |||
| 'r1_k3_s11_e1_i32_o16_se0.25', | |||
| 'r2_k3_s22_e6_i16_o24_se0.25', | |||
| 'r2_k5_s22_e6_i24_o40_se0.25', | |||
| 'r3_k3_s22_e6_i40_o80_se0.25', | |||
| 'r3_k5_s11_e6_i80_o112_se0.25', | |||
| 'r4_k5_s22_e6_i112_o192_se0.25', | |||
| 'r1_k3_s11_e6_i192_o320_se0.25', | |||
| ] | |||
| blocks_args = BlockDecoder.decode(blocks_args) | |||
| global_params = GlobalParams( | |||
| width_coefficient=width_coefficient, | |||
| depth_coefficient=depth_coefficient, | |||
| image_size=image_size, | |||
| dropout_rate=dropout_rate, | |||
| num_classes=num_classes, | |||
| batch_norm_momentum=0.99, | |||
| batch_norm_epsilon=1e-3, | |||
| drop_connect_rate=drop_connect_rate, | |||
| depth_divisor=8, | |||
| min_depth=None, | |||
| include_top=include_top, | |||
| ) | |||
| return blocks_args, global_params | |||
| def get_model_params(model_name, override_params): | |||
| """Get the block args and global params for a given model name. | |||
| Args: | |||
| model_name (str): Model's name. | |||
| override_params (dict): A dict to modify global_params. | |||
| Returns: | |||
| blocks_args, global_params | |||
| """ | |||
| if model_name.startswith('efficientnet'): | |||
| w, d, s, p = efficientnet_params(model_name) | |||
| blocks_args, global_params = efficientnet( | |||
| width_coefficient=w, | |||
| depth_coefficient=d, | |||
| dropout_rate=p, | |||
| image_size=s) | |||
| else: | |||
| raise NotImplementedError( | |||
| 'model name is not pre-defined: {}'.format(model_name)) | |||
| if override_params: | |||
| global_params = global_params._replace(**override_params) | |||
| return blocks_args, global_params | |||
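# Example (illustrative): fetch the b0 configuration and override the
# classifier head, as from_name does internally.
# >>> blocks_args, gp = get_model_params('efficientnet-b0', {'num_classes': 7})
# >>> gp.width_coefficient, gp.depth_coefficient, gp.num_classes
# (1.0, 1.0, 7)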
| def load_pretrained_weights(model, | |||
| model_name, | |||
| weights_path=None, | |||
| load_fc=True, | |||
| advprop=False, | |||
| verbose=True): | |||
| """Loads pretrained weights from weights path or download using url. | |||
| Args: | |||
| model (Module): The whole model of efficientnet. | |||
| model_name (str): Model name of efficientnet. | |||
| weights_path (None or str): | |||
| str: path to pretrained weights file on the local disk. | |||
| None: use pretrained weights downloaded from the Internet. | |||
| load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model. | |||
| advprop (bool): Whether to load pretrained weights | |||
| trained with advprop (valid when weights_path is None). | |||
| """ | |||
| if isinstance(weights_path, str): | |||
| state_dict = torch.load(weights_path) | |||
| else: | |||
| url_map_ = url_map_advprop if advprop else url_map | |||
| state_dict = model_zoo.load_url(url_map_[model_name]) | |||
| if load_fc: | |||
| ret = model.load_state_dict(state_dict, strict=False) | |||
| assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format( | |||
| ret.missing_keys) | |||
| else: | |||
| state_dict.pop('_fc.weight') | |||
| state_dict.pop('_fc.bias') | |||
| ret = model.load_state_dict(state_dict, strict=False) | |||
| assert set(ret.missing_keys) == set([ | |||
| '_fc.weight', '_fc.bias' | |||
| ]), 'Missing keys when loading pretrained weights: {}'.format( | |||
| ret.missing_keys) | |||
assert not ret.unexpected_keys, 'Unexpected keys when loading pretrained weights: {}'.format(
ret.unexpected_keys)
| if verbose: | |||
| print('Loaded pretrained weights for {}'.format(model_name)) | |||
| @@ -0,0 +1,67 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import torch | |||
| from PIL import Image | |||
| from torch import nn | |||
| from torchvision import transforms | |||
| from modelscope.utils.logger import get_logger | |||
| from .face_alignment.face_align import face_detection_PIL_v2 | |||
| logger = get_logger() | |||
| def transform_PIL(img_pil): | |||
| val_transforms = transforms.Compose([ | |||
| transforms.ToTensor(), | |||
| transforms.Normalize( | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| ]) | |||
| return val_transforms(img_pil) | |||
| index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26] | |||
| emotion_list = [ | |||
| 'Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise' | |||
| ] | |||
| def inference(image_path, model, face_model, score_thre=0.5, GPU=0): | |||
| image = Image.open(image_path).convert('RGB') | |||
| face, bbox = face_detection_PIL_v2(image, face_model) | |||
| if bbox is None: | |||
logger.warning('no face detected!')
| result = {'emotion_result': None, 'box': None} | |||
| return result | |||
| face = transform_PIL(face) | |||
| face = face.unsqueeze(0) | |||
| if torch.cuda.is_available(): | |||
| face = face.cuda(GPU) | |||
| logits_AU, logits_emotion = model(face) | |||
| logits_AU = torch.sigmoid(logits_AU) | |||
| logits_emotion = nn.functional.softmax(logits_emotion, 1) | |||
| _, index_list = logits_emotion.max(1) | |||
| emotion_index = index_list[0].data.item() | |||
| prob = logits_emotion[0][emotion_index] | |||
| if prob > score_thre and emotion_index != 3: | |||
| cur_emotion = emotion_list[emotion_index] | |||
| else: | |||
| cur_emotion = 'Neutral' | |||
| logits_AU = logits_AU[0] | |||
au_output = torch.zeros_like(logits_AU)
au_output[logits_AU >= score_thre] = 1
au_output[logits_AU < score_thre] = 0
au_output = au_output.int()
cur_au_list = []
for idx in range(au_output.shape[0]):
if au_output[idx] == 1:
| au = index2AU[idx] | |||
| cur_au_list.append(au) | |||
| cur_au_list.sort() | |||
| result = (cur_emotion, bbox) | |||
| return result | |||
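A minimal usage sketch for the helper above; the paths are hypothetical placeholders, and in practice this function is driven by the face-emotion pipeline rather than called directly. Note the asymmetric return: an (emotion, box) tuple on success, but a dict of None values when no face is found.

model = EfficientNetForFaceEmotion('/path/to/model_dir')  # hypothetical model dir
ret = inference('/path/to/face.jpg', model, '/path/to/face_detector.pb')  # hypothetical paths
if isinstance(ret, tuple):
    emotion, box = ret  # emotion is one of emotion_list, box is the face rectangle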
| @@ -0,0 +1,96 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import os | |||
| import sys | |||
| import torch | |||
| import torch.nn.functional as F | |||
| from torch import nn | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.models.cv.face_emotion.efficient import EfficientNet | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| logger = get_logger() | |||
| @MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion) | |||
| class EfficientNetForFaceEmotion(TorchModel): | |||
| def __init__(self, model_dir, device_id=0, *args, **kwargs): | |||
| super().__init__( | |||
| model_dir=model_dir, device_id=device_id, *args, **kwargs) | |||
| self.model = FaceEmotionModel( | |||
| name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7) | |||
| if torch.cuda.is_available(): | |||
| self.device = 'cuda' | |||
| logger.info('Use GPU') | |||
| else: | |||
| self.device = 'cpu' | |||
| logger.info('Use CPU') | |||
| pretrained_params = torch.load( | |||
| '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), | |||
| map_location=self.device) | |||
| state_dict = pretrained_params['model'] | |||
| new_state = {} | |||
| for k, v in state_dict.items(): | |||
| if k.startswith('module.'): | |||
| k = k[7:] | |||
| new_state[k] = v | |||
| self.model.load_state_dict(new_state) | |||
| self.model.eval() | |||
| self.model.to(self.device) | |||
| def forward(self, x): | |||
| logits_au, logits_emotion = self.model(x) | |||
| return logits_au, logits_emotion | |||
| class FaceEmotionModel(nn.Module): | |||
| def __init__(self, | |||
| name='efficientnet-b0', | |||
| num_embed=512, | |||
| num_au=12, | |||
| num_emotion=7): | |||
| super(FaceEmotionModel, self).__init__() | |||
| self.backbone = EfficientNet.from_pretrained( | |||
| name, weights_path=None, advprop=True) | |||
| self.average_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.embed = nn.Linear(self.backbone._fc.weight.data.shape[1], | |||
| num_embed) | |||
| self.features = nn.BatchNorm1d(num_embed) | |||
| nn.init.constant_(self.features.weight, 1.0) | |||
| self.features.weight.requires_grad = False | |||
| self.fc_au = nn.Sequential( | |||
| nn.Dropout(0.6), | |||
| nn.Linear(num_embed, num_au), | |||
| ) | |||
| self.fc_emotion = nn.Sequential( | |||
| nn.Dropout(0.6), | |||
| nn.Linear(num_embed, num_emotion), | |||
| ) | |||
| def feat_single_img(self, x): | |||
| x = self.backbone.extract_features(x) | |||
| x = self.average_pool(x) | |||
| x = x.flatten(1) | |||
| x = self.embed(x) | |||
| x = self.features(x) | |||
| return x | |||
| def forward(self, x): | |||
| x = self.feat_single_img(x) | |||
| logits_au = self.fc_au(x) | |||
| att_au = torch.sigmoid(logits_au).unsqueeze(-1) | |||
| x = x.unsqueeze(1) | |||
| emotion_vec_list = torch.matmul(att_au, x) | |||
| emotion_vec = emotion_vec_list.sum(1) | |||
| logits_emotion = self.fc_emotion(emotion_vec) | |||
| return logits_au, logits_emotion | |||
| @@ -0,0 +1,79 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import os | |||
| import cv2 | |||
| import numpy as np | |||
| import tensorflow as tf | |||
| def init(mod): | |||
| PATH_TO_CKPT = mod | |||
| net = tf.Graph() | |||
| with net.as_default(): | |||
| od_graph_def = tf.GraphDef() | |||
| config = tf.ConfigProto() | |||
| config.gpu_options.per_process_gpu_memory_fraction = 0.6 | |||
| with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid: | |||
| serialized_graph = fid.read() | |||
| od_graph_def.ParseFromString(serialized_graph) | |||
| tf.import_graph_def(od_graph_def, name='') | |||
| sess = tf.Session(graph=net, config=config) | |||
| return sess, net | |||
| def filter_bboxes_confs(shape, | |||
| imgsBboxes, | |||
| imgsConfs, | |||
| single=False, | |||
| thresh=0.5): | |||
| [w, h] = shape | |||
| if single: | |||
| bboxes, confs = [], [] | |||
| for y in range(len(imgsBboxes)): | |||
| if imgsConfs[y] >= thresh: | |||
| [x1, y1, x2, y2] = list(imgsBboxes[y]) | |||
| x1, y1, x2, y2 = int(w * x1), int(h * y1), int(w * x2), int( | |||
| h * y2) | |||
| bboxes.append([y1, x1, y2, x2]) | |||
| confs.append(imgsConfs[y]) | |||
| return bboxes, confs | |||
| else: | |||
| retImgsBboxes, retImgsConfs = [], [] | |||
| for x in range(len(imgsBboxes)): | |||
| bboxes, confs = [], [] | |||
| for y in range(len(imgsBboxes[x])): | |||
| if imgsConfs[x][y] >= thresh: | |||
| [x1, y1, x2, y2] = list(imgsBboxes[x][y]) | |||
| x1, y1, x2, y2 = int(w * x1), int(h * y1), int( | |||
| w * x2), int(h * y2) | |||
| bboxes.append([y1, x1, y2, x2]) | |||
| confs.append(imgsConfs[x][y]) | |||
| retImgsBboxes.append(bboxes) | |||
| retImgsConfs.append(confs) | |||
| return retImgsBboxes, retImgsConfs | |||
| def detect(im, sess, net): | |||
| image_np = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) | |||
| image_np_expanded = np.expand_dims(image_np, axis=0) | |||
| image_tensor = net.get_tensor_by_name('image_tensor:0') | |||
| bboxes = net.get_tensor_by_name('detection_boxes:0') | |||
| dConfs = net.get_tensor_by_name('detection_scores:0') | |||
| classes = net.get_tensor_by_name('detection_classes:0') | |||
| num_detections = net.get_tensor_by_name('num_detections:0') | |||
| (bboxes, dConfs, classes, | |||
| num_detections) = sess.run([bboxes, dConfs, classes, num_detections], | |||
| feed_dict={image_tensor: image_np_expanded}) | |||
| w, h, _ = im.shape | |||
| bboxes, confs = filter_bboxes_confs([w, h], bboxes[0], dConfs[0], True) | |||
| return bboxes, confs | |||
| class FaceDetector: | |||
| def __init__(self, mod): | |||
| self.sess, self.net = init(mod) | |||
| def do_detect(self, im): | |||
| bboxes, confs = detect(im, self.sess, self.net) | |||
| return bboxes, confs | |||
| @@ -0,0 +1,59 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import os | |||
| import sys | |||
| import cv2 | |||
| import numpy as np | |||
| from PIL import Image, ImageFile | |||
| from .face import FaceDetector | |||
| ImageFile.LOAD_TRUNCATED_IMAGES = True | |||
| def adjust_bx_v2(box, w, h): | |||
| x1, y1, x2, y2 = box[0], box[1], box[2], box[3] | |||
| box_w = x2 - x1 | |||
| box_h = y2 - y1 | |||
| delta = abs(box_w - box_h) | |||
| if box_w > box_h: | |||
| if y1 >= delta: | |||
| y1 = y1 - delta | |||
| else: | |||
| delta_y1 = y1 | |||
| y1 = 0 | |||
| delta_y2 = delta - delta_y1 | |||
| y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1 | |||
| else: | |||
| if x1 >= delta / 2 and x2 <= w - delta / 2: | |||
| x1 = x1 - delta / 2 | |||
| x2 = x2 + delta / 2 | |||
| elif x1 < delta / 2 and x2 <= w - delta / 2: | |||
| delta_x1 = x1 | |||
| x1 = 0 | |||
| delta_x2 = delta - delta_x1 | |||
| x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1 | |||
| elif x1 >= delta / 2 and x2 > w - delta / 2: | |||
| delta_x2 = w - x2 | |||
| x2 = w - 1 | |||
| delta_x1 = delta - x1 | |||
| x1 = x1 - delta_x1 if x1 >= delta_x1 else 0 | |||
| x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) | |||
| return [x1, y1, x2, y2] | |||
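# Worked example (illustrative): a wide box is grown vertically into a
# square. For box [10, 50, 110, 100] in a 200x200 image: box_w=100,
# box_h=50, delta=50, and since y1 >= delta, y1 moves up to 0, giving
# [10, 0, 110, 100], a 100x100 crop.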
| def face_detection_PIL_v2(image, face_model): | |||
| crop_size = 112 | |||
| face_detector = FaceDetector(face_model) | |||
| img = np.array(image) | |||
| h, w = img.shape[0:2] | |||
| bxs, conf = face_detector.do_detect(img) | |||
| bx = bxs[0] | |||
| bx = adjust_bx_v2(bx, w, h) | |||
| x1, y1, x2, y2 = bx | |||
| image = img[y1:y2, x1:x2, :] | |||
| img = Image.fromarray(image) | |||
| img = img.resize((crop_size, crop_size)) | |||
| bx = tuple(bx) | |||
| return img, bx | |||
| @@ -1,3 +1,5 @@ | |||
# The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License
| # at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py | |||
| import contextlib | |||
| import warnings | |||
| @@ -1,3 +1,5 @@ | |||
# The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py
| import os | |||
| import torch | |||
| @@ -1,3 +1,5 @@ | |||
# The implementation is adopted from stylegan2-pytorch, made publicly available under the MIT License
| # at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py | |||
| import os | |||
| from collections import abc | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from stylegan2-pytorch, | |||
# made publicly available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
| import functools | |||
| import math | |||
| import operator | |||
| @@ -0,0 +1,20 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .det_infer import NanoDetForFaceHumanHandDetection | |||
| else: | |||
| _import_structure = {'det_infer': ['NanoDetForFaceHumanHandDetection']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,133 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from .one_stage_detector import OneStageDetector | |||
| logger = get_logger() | |||
| def load_model_weight(model_dir, device): | |||
| checkpoint = torch.load( | |||
| '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), | |||
| map_location=device) | |||
| state_dict = checkpoint['state_dict'].copy() | |||
| for k in checkpoint['state_dict']: | |||
| if k.startswith('avg_model.'): | |||
| v = state_dict.pop(k) | |||
| state_dict[k[4:]] = v | |||
| return state_dict | |||
| @MODELS.register_module( | |||
| Tasks.face_human_hand_detection, | |||
| module_name=Models.face_human_hand_detection) | |||
| class NanoDetForFaceHumanHandDetection(TorchModel): | |||
| def __init__(self, model_dir, device_id=0, *args, **kwargs): | |||
| super().__init__( | |||
| model_dir=model_dir, device_id=device_id, *args, **kwargs) | |||
| self.model = OneStageDetector() | |||
| if torch.cuda.is_available(): | |||
| self.device = 'cuda' | |||
logger.info('Use GPU')
| else: | |||
| self.device = 'cpu' | |||
| logger.info('Use CPU') | |||
| self.state_dict = load_model_weight(model_dir, self.device) | |||
| self.model.load_state_dict(self.state_dict, strict=False) | |||
| self.model.eval() | |||
| self.model.to(self.device) | |||
| def forward(self, x): | |||
| pred_result = self.model.inference(x) | |||
| return pred_result | |||
| def naive_collate(batch): | |||
| elem = batch[0] | |||
| if isinstance(elem, dict): | |||
| return {key: naive_collate([d[key] for d in batch]) for key in elem} | |||
| else: | |||
| return batch | |||
| def get_resize_matrix(raw_shape, dst_shape): | |||
| r_w, r_h = raw_shape | |||
| d_w, d_h = dst_shape | |||
| Rs = np.eye(3) | |||
| Rs[0, 0] *= d_w / r_w | |||
| Rs[1, 1] *= d_h / r_h | |||
| return Rs | |||
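# Example (illustrative): resizing a 640x480 frame to the fixed 320x320
# network input yields a scale-only homography:
# >>> get_resize_matrix((640, 480), (320, 320))
# array([[0.5       , 0.        , 0.        ],
#        [0.        , 0.66666667, 0.        ],
#        [0.        , 0.        , 1.        ]])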
| def color_aug_and_norm(meta, mean, std): | |||
| img = meta['img'].astype(np.float32) / 255 | |||
| mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / 255 | |||
| std = np.array(std, dtype=np.float32).reshape(1, 1, 3) / 255 | |||
| img = (img - mean) / std | |||
| meta['img'] = img | |||
| return meta | |||
| def img_process(meta, mean, std): | |||
| raw_img = meta['img'] | |||
| height = raw_img.shape[0] | |||
| width = raw_img.shape[1] | |||
| dst_shape = [320, 320] | |||
| M = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) | |||
| ResizeM = get_resize_matrix((width, height), dst_shape) | |||
| M = ResizeM @ M | |||
| img = cv2.warpPerspective(raw_img, M, dsize=tuple(dst_shape)) | |||
| meta['img'] = img | |||
| meta['warp_matrix'] = M | |||
| meta = color_aug_and_norm(meta, mean, std) | |||
| return meta | |||
| def overlay_bbox_cv(dets, class_names, score_thresh): | |||
| all_box = [] | |||
| for label in dets: | |||
| for bbox in dets[label]: | |||
| score = bbox[-1] | |||
| if score > score_thresh: | |||
| x0, y0, x1, y1 = [int(i) for i in bbox[:4]] | |||
| all_box.append([label, x0, y0, x1, y1, score]) | |||
| all_box.sort(key=lambda v: v[5]) | |||
| return all_box | |||
| mean = [103.53, 116.28, 123.675] | |||
| std = [57.375, 57.12, 58.395] | |||
| class_names = ['person', 'face', 'hand'] | |||
| def inference(model, device, img_path): | |||
| img_info = {'id': 0} | |||
| img = cv2.imread(img_path) | |||
| height, width = img.shape[:2] | |||
| img_info['height'] = height | |||
| img_info['width'] = width | |||
| meta = dict(img_info=img_info, raw_img=img, img=img) | |||
| meta = img_process(meta, mean, std) | |||
| meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device) | |||
| meta = naive_collate([meta]) | |||
| meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320) | |||
| with torch.no_grad(): | |||
| res = model(meta) | |||
| result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35) | |||
| return result | |||
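| # Minimal usage sketch (the model directory and image path below are hypothetical): | |||
| # | |||
| #     detector = NanoDetForFaceHumanHandDetection(model_dir='/path/to/model_dir') | |||
| #     boxes = inference(detector, detector.device, '/path/to/test.jpg') | |||
| #     # each row is [label, x0, y0, x1, y1, score], sorted by ascending score | |||
| #     for label, x0, y0, x1, y1, score in boxes: | |||
| #         print(class_names[label], (x0, y0, x1, y1), score) | |||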
| @@ -0,0 +1,395 @@ | |||
| # The implementation here is modified based on nanodet, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||
| import math | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from .utils import ConvModule, DepthwiseConvModule, act_layers | |||
| def _make_divisible(v, divisor, min_value=None): | |||
| if min_value is None: | |||
| min_value = divisor | |||
| new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) | |||
| # Make sure that round down does not go down by more than 10%. | |||
| if new_v < 0.9 * v: | |||
| new_v += divisor | |||
| return new_v | |||
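| # For example, _make_divisible(22, 8) returns 24 (the nearest multiple of 8), | |||
| # while _make_divisible(10, 8) returns 16, because rounding down to 8 would | |||
| # shrink the channel count by more than 10%. | |||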
| def hard_sigmoid(x, inplace: bool = False): | |||
| if inplace: | |||
| return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0) | |||
| else: | |||
| return F.relu6(x + 3.0) / 6.0 | |||
| class SqueezeExcite(nn.Module): | |||
| def __init__(self, | |||
| in_chs, | |||
| se_ratio=0.25, | |||
| reduced_base_chs=None, | |||
| activation='ReLU', | |||
| gate_fn=hard_sigmoid, | |||
| divisor=4, | |||
| **_): | |||
| super(SqueezeExcite, self).__init__() | |||
| self.gate_fn = gate_fn | |||
| reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio, | |||
| divisor) | |||
| self.avg_pool = nn.AdaptiveAvgPool2d(1) | |||
| self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) | |||
| self.act1 = act_layers(activation) | |||
| self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) | |||
| def forward(self, x): | |||
| x_se = self.avg_pool(x) | |||
| x_se = self.conv_reduce(x_se) | |||
| x_se = self.act1(x_se) | |||
| x_se = self.conv_expand(x_se) | |||
| x = x * self.gate_fn(x_se) | |||
| return x | |||
| class GhostModule(nn.Module): | |||
| def __init__(self, | |||
| inp, | |||
| oup, | |||
| kernel_size=1, | |||
| ratio=2, | |||
| dw_size=3, | |||
| stride=1, | |||
| activation='ReLU'): | |||
| super(GhostModule, self).__init__() | |||
| self.oup = oup | |||
| init_channels = math.ceil(oup / ratio) | |||
| new_channels = init_channels * (ratio - 1) | |||
| self.primary_conv = nn.Sequential( | |||
| nn.Conv2d( | |||
| inp, | |||
| init_channels, | |||
| kernel_size, | |||
| stride, | |||
| kernel_size // 2, | |||
| bias=False), | |||
| nn.BatchNorm2d(init_channels), | |||
| act_layers(activation) if activation else nn.Sequential(), | |||
| ) | |||
| self.cheap_operation = nn.Sequential( | |||
| nn.Conv2d( | |||
| init_channels, | |||
| new_channels, | |||
| dw_size, | |||
| 1, | |||
| dw_size // 2, | |||
| groups=init_channels, | |||
| bias=False, | |||
| ), | |||
| nn.BatchNorm2d(new_channels), | |||
| act_layers(activation) if activation else nn.Sequential(), | |||
| ) | |||
| def forward(self, x): | |||
| x1 = self.primary_conv(x) | |||
| x2 = self.cheap_operation(x1) | |||
| out = torch.cat([x1, x2], dim=1) | |||
| return out | |||
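| # Channel bookkeeping: with oup=64 and ratio=2, primary_conv emits | |||
| # init_channels = ceil(64 / 2) = 32 maps and cheap_operation adds | |||
| # new_channels = 32 * (2 - 1) = 32 cheap depthwise maps, so the concatenated | |||
| # output carries exactly init_channels + new_channels = 64 = oup channels. | |||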
| class GhostBottleneck(nn.Module): | |||
| """Ghost bottleneck w/ optional SE""" | |||
| def __init__( | |||
| self, | |||
| in_chs, | |||
| mid_chs, | |||
| out_chs, | |||
| dw_kernel_size=3, | |||
| stride=1, | |||
| activation='ReLU', | |||
| se_ratio=0.0, | |||
| ): | |||
| super(GhostBottleneck, self).__init__() | |||
| has_se = se_ratio is not None and se_ratio > 0.0 | |||
| self.stride = stride | |||
| # Point-wise expansion | |||
| self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation) | |||
| # Depth-wise convolution | |||
| if self.stride > 1: | |||
| self.conv_dw = nn.Conv2d( | |||
| mid_chs, | |||
| mid_chs, | |||
| dw_kernel_size, | |||
| stride=stride, | |||
| padding=(dw_kernel_size - 1) // 2, | |||
| groups=mid_chs, | |||
| bias=False, | |||
| ) | |||
| self.bn_dw = nn.BatchNorm2d(mid_chs) | |||
| if has_se: | |||
| self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio) | |||
| else: | |||
| self.se = None | |||
| self.ghost2 = GhostModule(mid_chs, out_chs, activation=None) | |||
| if in_chs == out_chs and self.stride == 1: | |||
| self.shortcut = nn.Sequential() | |||
| else: | |||
| self.shortcut = nn.Sequential( | |||
| nn.Conv2d( | |||
| in_chs, | |||
| in_chs, | |||
| dw_kernel_size, | |||
| stride=stride, | |||
| padding=(dw_kernel_size - 1) // 2, | |||
| groups=in_chs, | |||
| bias=False, | |||
| ), | |||
| nn.BatchNorm2d(in_chs), | |||
| nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False), | |||
| nn.BatchNorm2d(out_chs), | |||
| ) | |||
| def forward(self, x): | |||
| residual = x | |||
| x = self.ghost1(x) | |||
| if self.stride > 1: | |||
| x = self.conv_dw(x) | |||
| x = self.bn_dw(x) | |||
| if self.se is not None: | |||
| x = self.se(x) | |||
| x = self.ghost2(x) | |||
| x += self.shortcut(residual) | |||
| return x | |||
| class GhostBlocks(nn.Module): | |||
| """Stack of GhostBottleneck used in GhostPAN. | |||
| Args: | |||
| in_channels (int): Number of input channels. | |||
| out_channels (int): Number of output channels. | |||
| expand (int): Expand ratio of GhostBottleneck. Default: 1. | |||
| kernel_size (int): Kernel size of depthwise convolution. Default: 5. | |||
| num_blocks (int): Number of GhostBottleneck blocks. Default: 1. | |||
| use_res (bool): Whether to use residual connection. Default: False. | |||
| activation (str): Name of activation function. Default: LeakyReLU. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| in_channels, | |||
| out_channels, | |||
| expand=1, | |||
| kernel_size=5, | |||
| num_blocks=1, | |||
| use_res=False, | |||
| activation='LeakyReLU', | |||
| ): | |||
| super(GhostBlocks, self).__init__() | |||
| self.use_res = use_res | |||
| if use_res: | |||
| self.reduce_conv = ConvModule( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| activation=activation, | |||
| ) | |||
| blocks = [] | |||
| for _ in range(num_blocks): | |||
| blocks.append( | |||
| GhostBottleneck( | |||
| in_channels, | |||
| int(out_channels * expand), | |||
| out_channels, | |||
| dw_kernel_size=kernel_size, | |||
| activation=activation, | |||
| )) | |||
| self.blocks = nn.Sequential(*blocks) | |||
| def forward(self, x): | |||
| out = self.blocks(x) | |||
| if self.use_res: | |||
| out = out + self.reduce_conv(x) | |||
| return out | |||
| class GhostPAN(nn.Module): | |||
| """Path Aggregation Network with Ghost block. | |||
| Args: | |||
| in_channels (List[int]): Number of input channels per scale. | |||
| out_channels (int): Number of output channels (used at each scale). | |||
| use_depthwise (bool): Whether to use depthwise separable convolution | |||
| in blocks. Default: False. | |||
| kernel_size (int): Kernel size of depthwise convolution. Default: 5. | |||
| expand (int): Expand ratio of GhostBottleneck. Default: 1. | |||
| num_blocks (int): Number of GhostBottleneck blocks. Default: 1. | |||
| use_res (bool): Whether to use residual connection. Default: False. | |||
| num_extra_level (int): Number of extra conv layers for more feature levels. | |||
| Default: 0. | |||
| upsample_cfg (dict): Config dict for interpolate layer. | |||
| Default: `dict(scale_factor=2, mode='bilinear')` | |||
| norm_cfg (dict): Config dict for normalization layer. | |||
| Default: dict(type='BN') | |||
| activation (str): Activation layer name. | |||
| Default: LeakyReLU. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| in_channels, | |||
| out_channels, | |||
| use_depthwise=False, | |||
| kernel_size=5, | |||
| expand=1, | |||
| num_blocks=1, | |||
| use_res=False, | |||
| num_extra_level=0, | |||
| upsample_cfg=dict(scale_factor=2, mode='bilinear'), | |||
| norm_cfg=dict(type='BN'), | |||
| activation='LeakyReLU', | |||
| ): | |||
| super(GhostPAN, self).__init__() | |||
| assert num_extra_level >= 0 | |||
| assert num_blocks >= 1 | |||
| self.in_channels = in_channels | |||
| self.out_channels = out_channels | |||
| conv = DepthwiseConvModule if use_depthwise else ConvModule | |||
| # build top-down blocks | |||
| self.upsample = nn.Upsample(**upsample_cfg) | |||
| self.reduce_layers = nn.ModuleList() | |||
| for idx in range(len(in_channels)): | |||
| self.reduce_layers.append( | |||
| ConvModule( | |||
| in_channels[idx], | |||
| out_channels, | |||
| 1, | |||
| norm_cfg=norm_cfg, | |||
| activation=activation, | |||
| )) | |||
| self.top_down_blocks = nn.ModuleList() | |||
| for idx in range(len(in_channels) - 1, 0, -1): | |||
| self.top_down_blocks.append( | |||
| GhostBlocks( | |||
| out_channels * 2, | |||
| out_channels, | |||
| expand, | |||
| kernel_size=kernel_size, | |||
| num_blocks=num_blocks, | |||
| use_res=use_res, | |||
| activation=activation, | |||
| )) | |||
| # build bottom-up blocks | |||
| self.downsamples = nn.ModuleList() | |||
| self.bottom_up_blocks = nn.ModuleList() | |||
| for idx in range(len(in_channels) - 1): | |||
| self.downsamples.append( | |||
| conv( | |||
| out_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=2, | |||
| padding=kernel_size // 2, | |||
| norm_cfg=norm_cfg, | |||
| activation=activation, | |||
| )) | |||
| self.bottom_up_blocks.append( | |||
| GhostBlocks( | |||
| out_channels * 2, | |||
| out_channels, | |||
| expand, | |||
| kernel_size=kernel_size, | |||
| num_blocks=num_blocks, | |||
| use_res=use_res, | |||
| activation=activation, | |||
| )) | |||
| # extra layers | |||
| self.extra_lvl_in_conv = nn.ModuleList() | |||
| self.extra_lvl_out_conv = nn.ModuleList() | |||
| for i in range(num_extra_level): | |||
| self.extra_lvl_in_conv.append( | |||
| conv( | |||
| out_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=2, | |||
| padding=kernel_size // 2, | |||
| norm_cfg=norm_cfg, | |||
| activation=activation, | |||
| )) | |||
| self.extra_lvl_out_conv.append( | |||
| conv( | |||
| out_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=2, | |||
| padding=kernel_size // 2, | |||
| norm_cfg=norm_cfg, | |||
| activation=activation, | |||
| )) | |||
| def forward(self, inputs): | |||
| """ | |||
| Args: | |||
| inputs (tuple[Tensor]): input features. | |||
| Returns: | |||
| tuple[Tensor]: multi level features. | |||
| """ | |||
| assert len(inputs) == len(self.in_channels) | |||
| inputs = [ | |||
| reduce(input_x) | |||
| for input_x, reduce in zip(inputs, self.reduce_layers) | |||
| ] | |||
| # top-down path | |||
| inner_outs = [inputs[-1]] | |||
| for idx in range(len(self.in_channels) - 1, 0, -1): | |||
| feat_height = inner_outs[0] | |||
| feat_low = inputs[idx - 1] | |||
| upsample_feat = self.upsample(feat_height) | |||
| inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( | |||
| torch.cat([upsample_feat, feat_low], 1)) | |||
| inner_outs.insert(0, inner_out) | |||
| # bottom-up path | |||
| outs = [inner_outs[0]] | |||
| for idx in range(len(self.in_channels) - 1): | |||
| feat_low = outs[-1] | |||
| feat_height = inner_outs[idx + 1] | |||
| downsample_feat = self.downsamples[idx](feat_low) | |||
| out = self.bottom_up_blocks[idx]( | |||
| torch.cat([downsample_feat, feat_height], 1)) | |||
| outs.append(out) | |||
| # extra layers | |||
| for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv, | |||
| self.extra_lvl_out_conv): | |||
| outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1])) | |||
| return tuple(outs) | |||
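| # Shape-check sketch (channel sizes follow the ShuffleNetV2 1.0x backbone used | |||
| # by OneStageDetector below): | |||
| # | |||
| #     import torch | |||
| #     pan = GhostPAN(in_channels=[116, 232, 464], out_channels=96, | |||
| #                    use_depthwise=True, num_extra_level=1) | |||
| #     feats = [torch.randn(1, c, s, s) | |||
| #              for c, s in zip([116, 232, 464], [40, 20, 10])] | |||
| #     outs = pan(feats) | |||
| #     print([tuple(o.shape) for o in outs]) | |||
| #     # -> [(1, 96, 40, 40), (1, 96, 20, 20), (1, 96, 10, 10), (1, 96, 5, 5)] | |||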
| @@ -0,0 +1,427 @@ | |||
| # The implementation here is modified based on nanodet, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||
| import math | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| from torchvision.ops import nms | |||
| from .utils import ConvModule, DepthwiseConvModule | |||
| class Integral(nn.Module): | |||
| """A fixed layer for calculating integral result from distribution. | |||
| This layer calculates the target location by :math:`sum{P(y_i) * y_i}`, | |||
| where P(y_i) denotes the softmax vector that represents the discrete | |||
| distribution and y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}. | |||
| Args: | |||
| reg_max (int): The maximal value of the discrete set. Default: 16. You | |||
| may want to reset it according to your new dataset or related | |||
| settings. | |||
| """ | |||
| def __init__(self, reg_max=16): | |||
| super(Integral, self).__init__() | |||
| self.reg_max = reg_max | |||
| self.register_buffer('project', | |||
| torch.linspace(0, self.reg_max, self.reg_max + 1)) | |||
| def forward(self, x): | |||
| """Forward feature from the regression head to get integral result of | |||
| bounding box location. | |||
| Args: | |||
| x (Tensor): Features of the regression head, shape (N, 4*(n+1)), | |||
| n is self.reg_max. | |||
| Returns: | |||
| x (Tensor): Integral result of box locations, i.e., distance | |||
| offsets from the box center in four directions, shape (N, 4). | |||
| """ | |||
| shape = x.size() | |||
| x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1) | |||
| x = F.linear(x, self.project.type_as(x)).reshape(*shape[:-1], 4) | |||
| return x | |||
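| # Worked example: with reg_max=7 the head predicts 8 logits per box side; | |||
| # softmax turns them into a distribution over offsets {0, ..., 7} whose | |||
| # expectation is the decoded distance. | |||
| # | |||
| #     import torch | |||
| #     integral = Integral(reg_max=7) | |||
| #     logits = torch.zeros(1, 4 * 8)  # uniform distribution on every side | |||
| #     print(integral(logits))        # -> tensor([[3.5000, 3.5000, 3.5000, 3.5000]]) | |||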
| def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): | |||
| """Performs non-maximum suppression in a batched fashion. | |||
| Modified from https://github.com/pytorch/vision/blob | |||
| /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. | |||
| In order to perform NMS independently per class, we add an offset to all | |||
| the boxes. The offset is dependent only on the class idx, and is large | |||
| enough so that boxes from different classes do not overlap. | |||
| Arguments: | |||
| boxes (torch.Tensor): boxes in shape (N, 4). | |||
| scores (torch.Tensor): scores in shape (N, ). | |||
| idxs (torch.Tensor): each index value correspond to a bbox cluster, | |||
| and NMS will not be applied between elements of different idxs, | |||
| shape (N, ). | |||
| nms_cfg (dict): specify nms type and other parameters like iou_thr. | |||
| Possible keys includes the following. | |||
| - iou_threshold (float): IoU threshold used for NMS. | |||
| - split_thr (float): threshold number of boxes. In some cases the | |||
| number of boxes is large (e.g., 200k). To avoid OOM during | |||
| training, the users could set `split_thr` to a small value. | |||
| If the number of boxes is greater than the threshold, it will | |||
| perform NMS on each group of boxes separately and sequentially. | |||
| Defaults to 10000. | |||
| class_agnostic (bool): if true, nms is class agnostic, | |||
| i.e. IoU thresholding happens over all boxes, | |||
| regardless of the predicted class. | |||
| Returns: | |||
| tuple: kept dets and indices. | |||
| """ | |||
| nms_cfg_ = nms_cfg.copy() | |||
| class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) | |||
| if class_agnostic: | |||
| boxes_for_nms = boxes | |||
| else: | |||
| max_coordinate = boxes.max() | |||
| offsets = idxs.to(boxes) * (max_coordinate + 1) | |||
| boxes_for_nms = boxes + offsets[:, None] | |||
| nms_cfg_.pop('type', 'nms')  # drop the type key before forwarding kwargs to nms | |||
| split_thr = nms_cfg_.pop('split_thr', 10000) | |||
| if len(boxes_for_nms) < split_thr: | |||
| keep = nms(boxes_for_nms, scores, **nms_cfg_) | |||
| boxes = boxes[keep] | |||
| scores = scores[keep] | |||
| else: | |||
| total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) | |||
| for class_id in torch.unique(idxs): | |||
| mask = (idxs == class_id).nonzero(as_tuple=False).view(-1) | |||
| keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_) | |||
| total_mask[mask[keep]] = True | |||
| keep = total_mask.nonzero(as_tuple=False).view(-1) | |||
| keep = keep[scores[keep].argsort(descending=True)] | |||
| boxes = boxes[keep] | |||
| scores = scores[keep] | |||
| return torch.cat([boxes, scores[:, None]], -1), keep | |||
| def multiclass_nms(multi_bboxes, | |||
| multi_scores, | |||
| score_thr, | |||
| nms_cfg, | |||
| max_num=-1, | |||
| score_factors=None): | |||
| """NMS for multi-class bboxes. | |||
| Args: | |||
| multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) | |||
| multi_scores (Tensor): shape (n, #class), where the last column | |||
| contains scores of the background class, but this will be ignored. | |||
| score_thr (float): bbox threshold, bboxes with scores lower than it | |||
| will not be considered. | |||
| nms_cfg (dict): NMS config, e.g. the NMS type and IoU threshold. | |||
| max_num (int): if there are more than max_num bboxes after NMS, | |||
| only top max_num will be kept. | |||
| score_factors (Tensor): The factors multiplied to scores before | |||
| applying NMS | |||
| Returns: | |||
| tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels \ | |||
| are 0-based. | |||
| """ | |||
| num_classes = multi_scores.size(1) - 1 | |||
| if multi_bboxes.shape[1] > 4: | |||
| bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) | |||
| else: | |||
| bboxes = multi_bboxes[:, None].expand( | |||
| multi_scores.size(0), num_classes, 4) | |||
| scores = multi_scores[:, :-1] | |||
| valid_mask = scores > score_thr | |||
| bboxes = torch.masked_select( | |||
| bboxes, | |||
| torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), | |||
| -1)).view(-1, 4) | |||
| if score_factors is not None: | |||
| scores = scores * score_factors[:, None] | |||
| scores = torch.masked_select(scores, valid_mask) | |||
| labels = valid_mask.nonzero(as_tuple=False)[:, 1] | |||
| if bboxes.numel() == 0: | |||
| bboxes = multi_bboxes.new_zeros((0, 5)) | |||
| labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) | |||
| if torch.onnx.is_in_onnx_export(): | |||
| raise RuntimeError('[ONNX Error] Can not record NMS ' | |||
| 'as it has not been executed this time') | |||
| return bboxes, labels | |||
| dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) | |||
| if max_num > 0: | |||
| dets = dets[:max_num] | |||
| keep = keep[:max_num] | |||
| return dets, labels[keep] | |||
| def distance2bbox(points, distance, max_shape=None): | |||
| """Decode distance prediction to bounding box. | |||
| Args: | |||
| points (Tensor): Shape (n, 2), [x, y]. | |||
| distance (Tensor): Distance from the given point to 4 | |||
| boundaries (left, top, right, bottom). | |||
| max_shape (tuple): Shape of the image. | |||
| Returns: | |||
| Tensor: Decoded bboxes. | |||
| """ | |||
| x1 = points[..., 0] - distance[..., 0] | |||
| y1 = points[..., 1] - distance[..., 1] | |||
| x2 = points[..., 0] + distance[..., 2] | |||
| y2 = points[..., 1] + distance[..., 3] | |||
| if max_shape is not None: | |||
| x1 = x1.clamp(min=0, max=max_shape[1]) | |||
| y1 = y1.clamp(min=0, max=max_shape[0]) | |||
| x2 = x2.clamp(min=0, max=max_shape[1]) | |||
| y2 = y2.clamp(min=0, max=max_shape[0]) | |||
| return torch.stack([x1, y1, x2, y2], -1) | |||
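| # Decoding sketch: a center at (100, 100) with predicted distances | |||
| # (left, top, right, bottom) = (10, 20, 30, 40) becomes the box (90, 80, 130, 140). | |||
| # | |||
| #     import torch | |||
| #     pts = torch.tensor([[100., 100.]]) | |||
| #     dist = torch.tensor([[10., 20., 30., 40.]]) | |||
| #     print(distance2bbox(pts, dist))  # -> tensor([[ 90.,  80., 130., 140.]]) | |||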
| def warp_boxes(boxes, M, width, height): | |||
| n = len(boxes) | |||
| if n: | |||
| xy = np.ones((n * 4, 3)) | |||
| xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) | |||
| xy = xy @ M.T | |||
| xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) | |||
| x = xy[:, [0, 2, 4, 6]] | |||
| y = xy[:, [1, 3, 5, 7]] | |||
| xy = np.concatenate( | |||
| (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T | |||
| xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) | |||
| xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) | |||
| return xy.astype(np.float32) | |||
| else: | |||
| return boxes | |||
| class NanoDetPlusHead(nn.Module): | |||
| """Detection head used in NanoDet-Plus. | |||
| Args: | |||
| num_classes (int): Number of categories excluding the background | |||
| category. | |||
| input_channel (int): Number of channels of the input feature. | |||
| feat_channels (int): Number of channels of the feature. | |||
| Default: 96. | |||
| stacked_convs (int): Number of conv layers in the stacked convs. | |||
| Default: 2. | |||
| kernel_size (int): Size of the convolving kernel. Default: 5. | |||
| strides (list[int]): Strides of input multi-level feature maps. | |||
| Default: [8, 16, 32]. | |||
| conv_type (str): Type of the convolution. | |||
| Default: "DWConv". | |||
| norm_cfg (dict): Dictionary to construct and config norm layer. | |||
| Default: dict(type='BN'). | |||
| reg_max (int): The maximal value of the discrete set. Default: 7. | |||
| activation (str): Type of activation function. Default: "LeakyReLU". | |||
| assigner_cfg (dict): Config dict of the assigner. Default: dict(topk=13). | |||
| """ | |||
| def __init__(self, | |||
| num_classes, | |||
| input_channel, | |||
| feat_channels=96, | |||
| stacked_convs=2, | |||
| kernel_size=5, | |||
| strides=[8, 16, 32], | |||
| conv_type='DWConv', | |||
| norm_cfg=dict(type='BN'), | |||
| reg_max=7, | |||
| activation='LeakyReLU', | |||
| assigner_cfg=dict(topk=13), | |||
| **kwargs): | |||
| super(NanoDetPlusHead, self).__init__() | |||
| self.num_classes = num_classes | |||
| self.in_channels = input_channel | |||
| self.feat_channels = feat_channels | |||
| self.stacked_convs = stacked_convs | |||
| self.kernel_size = kernel_size | |||
| self.strides = strides | |||
| self.reg_max = reg_max | |||
| self.activation = activation | |||
| self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule | |||
| self.norm_cfg = norm_cfg | |||
| self.distribution_project = Integral(self.reg_max) | |||
| self._init_layers() | |||
| def _init_layers(self): | |||
| self.cls_convs = nn.ModuleList() | |||
| for _ in self.strides: | |||
| cls_convs = self._build_not_shared_head() | |||
| self.cls_convs.append(cls_convs) | |||
| self.gfl_cls = nn.ModuleList([ | |||
| nn.Conv2d( | |||
| self.feat_channels, | |||
| self.num_classes + 4 * (self.reg_max + 1), | |||
| 1, | |||
| padding=0, | |||
| ) for _ in self.strides | |||
| ]) | |||
| def _build_not_shared_head(self): | |||
| cls_convs = nn.ModuleList() | |||
| for i in range(self.stacked_convs): | |||
| chn = self.in_channels if i == 0 else self.feat_channels | |||
| cls_convs.append( | |||
| self.ConvModule( | |||
| chn, | |||
| self.feat_channels, | |||
| self.kernel_size, | |||
| stride=1, | |||
| padding=self.kernel_size // 2, | |||
| norm_cfg=self.norm_cfg, | |||
| bias=self.norm_cfg is None, | |||
| activation=self.activation, | |||
| )) | |||
| return cls_convs | |||
| def forward(self, feats): | |||
| if torch.onnx.is_in_onnx_export(): | |||
| return self._forward_onnx(feats) | |||
| outputs = [] | |||
| for feat, cls_convs, gfl_cls in zip( | |||
| feats, | |||
| self.cls_convs, | |||
| self.gfl_cls, | |||
| ): | |||
| for conv in cls_convs: | |||
| feat = conv(feat) | |||
| output = gfl_cls(feat) | |||
| outputs.append(output.flatten(start_dim=2)) | |||
| outputs = torch.cat(outputs, dim=2).permute(0, 2, 1) | |||
| return outputs | |||
| def post_process(self, preds, meta): | |||
| """Prediction results post processing. Decode bboxes and rescale | |||
| to original image size. | |||
| Args: | |||
| preds (Tensor): Prediction output. | |||
| meta (dict): Meta info. | |||
| """ | |||
| cls_scores, bbox_preds = preds.split( | |||
| [self.num_classes, 4 * (self.reg_max + 1)], dim=-1) | |||
| result_list = self.get_bboxes(cls_scores, bbox_preds, meta) | |||
| det_results = {} | |||
| # naive_collate has already wrapped the per-image warp matrices in a list | |||
| warp_matrixes = meta['warp_matrix'] | |||
| img_heights = ( | |||
| meta['img_info']['height'].cpu().numpy() if isinstance( | |||
| meta['img_info']['height'], torch.Tensor) else | |||
| meta['img_info']['height']) | |||
| img_widths = ( | |||
| meta['img_info']['width'].cpu().numpy() if isinstance( | |||
| meta['img_info']['width'], torch.Tensor) else | |||
| meta['img_info']['width']) | |||
| img_ids = ( | |||
| meta['img_info']['id'].cpu().numpy() if isinstance( | |||
| meta['img_info']['id'], torch.Tensor) else | |||
| meta['img_info']['id']) | |||
| for result, img_width, img_height, img_id, warp_matrix in zip( | |||
| result_list, img_widths, img_heights, img_ids, warp_matrixes): | |||
| det_result = {} | |||
| det_bboxes, det_labels = result | |||
| det_bboxes = det_bboxes.detach().cpu().numpy() | |||
| det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4], | |||
| np.linalg.inv(warp_matrix), | |||
| img_width, img_height) | |||
| classes = det_labels.detach().cpu().numpy() | |||
| for i in range(self.num_classes): | |||
| inds = classes == i | |||
| det_result[i] = np.concatenate( | |||
| [ | |||
| det_bboxes[inds, :4].astype(np.float32), | |||
| det_bboxes[inds, 4:5].astype(np.float32), | |||
| ], | |||
| axis=1, | |||
| ).tolist() | |||
| det_results[img_id] = det_result | |||
| return det_results | |||
| def get_bboxes(self, cls_preds, reg_preds, img_metas): | |||
| """Decode the outputs to bboxes. | |||
| Args: | |||
| cls_preds (Tensor): Shape (num_imgs, num_points, num_classes). | |||
| reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (reg_max + 1)). | |||
| img_metas (dict): Dict of image info. | |||
| Returns: | |||
| results_list (list[tuple]): List of detection bboxes and labels. | |||
| """ | |||
| device = cls_preds.device | |||
| b = cls_preds.shape[0] | |||
| input_height, input_width = img_metas['img'].shape[2:] | |||
| input_shape = (input_height, input_width) | |||
| featmap_sizes = [(math.ceil(input_height / stride), | |||
| math.ceil(input_width / stride)) | |||
| for stride in self.strides] | |||
| mlvl_center_priors = [ | |||
| self.get_single_level_center_priors( | |||
| b, | |||
| featmap_sizes[i], | |||
| stride, | |||
| dtype=torch.float32, | |||
| device=device, | |||
| ) for i, stride in enumerate(self.strides) | |||
| ] | |||
| center_priors = torch.cat(mlvl_center_priors, dim=1) | |||
| dis_preds = self.distribution_project(reg_preds) * center_priors[..., | |||
| 2, | |||
| None] | |||
| bboxes = distance2bbox( | |||
| center_priors[..., :2], dis_preds, max_shape=input_shape) | |||
| scores = cls_preds.sigmoid() | |||
| result_list = [] | |||
| for i in range(b): | |||
| score, bbox = scores[i], bboxes[i] | |||
| padding = score.new_zeros(score.shape[0], 1) | |||
| score = torch.cat([score, padding], dim=1) | |||
| results = multiclass_nms( | |||
| bbox, | |||
| score, | |||
| score_thr=0.05, | |||
| nms_cfg=dict(type='nms', iou_threshold=0.6), | |||
| max_num=100, | |||
| ) | |||
| result_list.append(results) | |||
| return result_list | |||
| def get_single_level_center_priors(self, batch_size, featmap_size, stride, | |||
| dtype, device): | |||
| """Generate centers of a single stage feature map. | |||
| Args: | |||
| batch_size (int): Number of images in one batch. | |||
| featmap_size (tuple[int]): height and width of the feature map | |||
| stride (int): down sample stride of the feature map | |||
| dtype (obj:`torch.dtype`): data type of the tensors | |||
| device (obj:`torch.device`): device of the tensors | |||
| Return: | |||
| priors (Tensor): center priors of a single level feature map. | |||
| """ | |||
| h, w = featmap_size | |||
| x_range = (torch.arange(w, dtype=dtype, device=device)) * stride | |||
| y_range = (torch.arange(h, dtype=dtype, device=device)) * stride | |||
| y, x = torch.meshgrid(y_range, x_range) | |||
| y = y.flatten() | |||
| x = x.flatten() | |||
| strides = x.new_full((x.shape[0], ), stride) | |||
| priors = torch.stack([x, y, strides, strides], dim=-1) | |||
| return priors.unsqueeze(0).repeat(batch_size, 1, 1) | |||
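| # Prior-layout sketch: for a 2x2 feature map at stride 8 the priors are the | |||
| # grid coordinates scaled by the stride, each row storing (x, y, stride, stride). | |||
| # | |||
| #     import torch | |||
| #     head = NanoDetPlusHead(num_classes=3, input_channel=96) | |||
| #     print(head.get_single_level_center_priors( | |||
| #         1, (2, 2), 8, torch.float32, torch.device('cpu'))) | |||
| #     # -> tensor([[[0., 0., 8., 8.], [8., 0., 8., 8.], | |||
| #     #             [0., 8., 8., 8.], [8., 8., 8., 8.]]]) | |||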
| @@ -0,0 +1,64 @@ | |||
| # The implementation here is modified based on nanodet, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||
| import torch | |||
| import torch.nn as nn | |||
| from .ghost_pan import GhostPAN | |||
| from .nanodet_plus_head import NanoDetPlusHead | |||
| from .shufflenetv2 import ShuffleNetV2 | |||
| class OneStageDetector(nn.Module): | |||
| def __init__(self): | |||
| super(OneStageDetector, self).__init__() | |||
| self.backbone = ShuffleNetV2( | |||
| model_size='1.0x', | |||
| out_stages=(2, 3, 4), | |||
| with_last_conv=False, | |||
| kernal_size=3, | |||
| activation='LeakyReLU', | |||
| pretrain=False) | |||
| self.fpn = GhostPAN( | |||
| in_channels=[116, 232, 464], | |||
| out_channels=96, | |||
| use_depthwise=True, | |||
| kernel_size=5, | |||
| expand=1, | |||
| num_blocks=1, | |||
| use_res=False, | |||
| num_extra_level=1, | |||
| upsample_cfg=dict(scale_factor=2, mode='bilinear'), | |||
| norm_cfg=dict(type='BN'), | |||
| activation='LeakyReLU') | |||
| self.head = NanoDetPlusHead( | |||
| num_classes=3, | |||
| input_channel=96, | |||
| feat_channels=96, | |||
| stacked_convs=2, | |||
| kernel_size=5, | |||
| strides=[8, 16, 32, 64], | |||
| conv_type='DWConv', | |||
| norm_cfg=dict(type='BN'), | |||
| reg_max=7, | |||
| activation='LeakyReLU', | |||
| assigner_cfg=dict(topk=13)) | |||
| self.epoch = 0 | |||
| def forward(self, x): | |||
| x = self.backbone(x) | |||
| if hasattr(self, 'fpn'): | |||
| x = self.fpn(x) | |||
| if hasattr(self, 'head'): | |||
| x = self.head(x) | |||
| return x | |||
| def inference(self, meta): | |||
| with torch.no_grad(): | |||
| # torch.cuda.synchronize() raises on CPU-only machines, so guard it | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.synchronize() | |||
| preds = self(meta['img']) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.synchronize() | |||
| results = self.head.post_process(preds, meta) | |||
| if torch.cuda.is_available(): | |||
| torch.cuda.synchronize() | |||
| return results | |||
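| # Forward-pass sketch at the detector's 320x320 working resolution: | |||
| # | |||
| #     import torch | |||
| #     det = OneStageDetector() | |||
| #     preds = det(torch.randn(1, 3, 320, 320)) | |||
| #     # one row per prior over strides 8/16/32/64 (1600+400+100+25 = 2125), | |||
| #     # each with 3 class logits + 4 * (reg_max + 1) = 35 channels | |||
| #     print(preds.shape)  # -> torch.Size([1, 2125, 35]) | |||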
| @@ -0,0 +1,182 @@ | |||
| # The implementation here is modified based on nanodet, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||
| import torch | |||
| import torch.nn as nn | |||
| from .utils import act_layers | |||
| def channel_shuffle(x, groups): | |||
| batchsize, num_channels, height, width = x.size() | |||
| channels_per_group = num_channels // groups | |||
| x = x.view(batchsize, groups, channels_per_group, height, width) | |||
| x = torch.transpose(x, 1, 2).contiguous() | |||
| x = x.view(batchsize, -1, height, width) | |||
| return x | |||
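| # Shuffle sketch: with 4 channels and groups=2, channel order (0, 1, 2, 3) | |||
| # becomes (0, 2, 1, 3), interleaving the two groups. | |||
| # | |||
| #     import torch | |||
| #     x = torch.arange(4.).reshape(1, 4, 1, 1) | |||
| #     print(channel_shuffle(x, 2).flatten())  # -> tensor([0., 2., 1., 3.]) | |||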
| class ShuffleV2Block(nn.Module): | |||
| def __init__(self, inp, oup, stride, activation='ReLU'): | |||
| super(ShuffleV2Block, self).__init__() | |||
| if not (1 <= stride <= 3): | |||
| raise ValueError('illegal stride value') | |||
| self.stride = stride | |||
| branch_features = oup // 2 | |||
| assert (self.stride != 1) or (inp == branch_features << 1) | |||
| if self.stride > 1: | |||
| self.branch1 = nn.Sequential( | |||
| self.depthwise_conv( | |||
| inp, inp, kernel_size=3, stride=self.stride, padding=1), | |||
| nn.BatchNorm2d(inp), | |||
| nn.Conv2d( | |||
| inp, | |||
| branch_features, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| bias=False), | |||
| nn.BatchNorm2d(branch_features), | |||
| act_layers(activation), | |||
| ) | |||
| else: | |||
| self.branch1 = nn.Sequential() | |||
| self.branch2 = nn.Sequential( | |||
| nn.Conv2d( | |||
| inp if (self.stride > 1) else branch_features, | |||
| branch_features, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| bias=False, | |||
| ), | |||
| nn.BatchNorm2d(branch_features), | |||
| act_layers(activation), | |||
| self.depthwise_conv( | |||
| branch_features, | |||
| branch_features, | |||
| kernel_size=3, | |||
| stride=self.stride, | |||
| padding=1, | |||
| ), | |||
| nn.BatchNorm2d(branch_features), | |||
| nn.Conv2d( | |||
| branch_features, | |||
| branch_features, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| bias=False, | |||
| ), | |||
| nn.BatchNorm2d(branch_features), | |||
| act_layers(activation), | |||
| ) | |||
| @staticmethod | |||
| def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): | |||
| return nn.Conv2d( | |||
| i, o, kernel_size, stride, padding, bias=bias, groups=i) | |||
| def forward(self, x): | |||
| if self.stride == 1: | |||
| x1, x2 = x.chunk(2, dim=1) | |||
| out = torch.cat((x1, self.branch2(x2)), dim=1) | |||
| else: | |||
| out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) | |||
| out = channel_shuffle(out, 2) | |||
| return out | |||
| class ShuffleNetV2(nn.Module): | |||
| def __init__( | |||
| self, | |||
| model_size='1.5x', | |||
| out_stages=(2, 3, 4), | |||
| with_last_conv=False, | |||
| kernal_size=3, | |||
| activation='ReLU', | |||
| pretrain=True, | |||
| ): | |||
| super(ShuffleNetV2, self).__init__() | |||
| assert set(out_stages).issubset((2, 3, 4)) | |||
| print('model size is ', model_size) | |||
| self.stage_repeats = [4, 8, 4] | |||
| self.model_size = model_size | |||
| self.out_stages = out_stages | |||
| self.with_last_conv = with_last_conv | |||
| self.kernal_size = kernal_size | |||
| self.activation = activation | |||
| if model_size == '0.5x': | |||
| self._stage_out_channels = [24, 48, 96, 192, 1024] | |||
| elif model_size == '1.0x': | |||
| self._stage_out_channels = [24, 116, 232, 464, 1024] | |||
| elif model_size == '1.5x': | |||
| self._stage_out_channels = [24, 176, 352, 704, 1024] | |||
| elif model_size == '2.0x': | |||
| self._stage_out_channels = [24, 244, 488, 976, 2048] | |||
| else: | |||
| raise NotImplementedError | |||
| # building first layer | |||
| input_channels = 3 | |||
| output_channels = self._stage_out_channels[0] | |||
| self.conv1 = nn.Sequential( | |||
| nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), | |||
| nn.BatchNorm2d(output_channels), | |||
| act_layers(activation), | |||
| ) | |||
| input_channels = output_channels | |||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) | |||
| stage_names = ['stage{}'.format(i) for i in [2, 3, 4]] | |||
| for name, repeats, output_channels in zip( | |||
| stage_names, self.stage_repeats, self._stage_out_channels[1:]): | |||
| seq = [ | |||
| ShuffleV2Block( | |||
| input_channels, output_channels, 2, activation=activation) | |||
| ] | |||
| for i in range(repeats - 1): | |||
| seq.append( | |||
| ShuffleV2Block( | |||
| output_channels, | |||
| output_channels, | |||
| 1, | |||
| activation=activation)) | |||
| setattr(self, name, nn.Sequential(*seq)) | |||
| input_channels = output_channels | |||
| output_channels = self._stage_out_channels[-1] | |||
| if self.with_last_conv: | |||
| conv5 = nn.Sequential( | |||
| nn.Conv2d( | |||
| input_channels, output_channels, 1, 1, 0, bias=False), | |||
| nn.BatchNorm2d(output_channels), | |||
| act_layers(activation), | |||
| ) | |||
| self.stage4.add_module('conv5', conv5) | |||
| def forward(self, x): | |||
| x = self.conv1(x) | |||
| x = self.maxpool(x) | |||
| output = [] | |||
| for i in range(2, 5): | |||
| stage = getattr(self, 'stage{}'.format(i)) | |||
| x = stage(x) | |||
| if i in self.out_stages: | |||
| output.append(x) | |||
| return tuple(output) | |||
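| # Stage-output sketch for the 1.0x model at the detector's 320x320 input: | |||
| # | |||
| #     import torch | |||
| #     net = ShuffleNetV2(model_size='1.0x', out_stages=(2, 3, 4), pretrain=False) | |||
| #     c3, c4, c5 = net(torch.randn(1, 3, 320, 320)) | |||
| #     print(c3.shape, c4.shape, c5.shape) | |||
| #     # -> (1, 116, 40, 40), (1, 232, 20, 20), (1, 464, 10, 10) | |||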
| @@ -0,0 +1,277 @@ | |||
| # The implementation here is modified based on nanodet, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet | |||
| import warnings | |||
| import torch | |||
| import torch.nn as nn | |||
| activations = { | |||
| 'ReLU': nn.ReLU, | |||
| 'LeakyReLU': nn.LeakyReLU, | |||
| 'ReLU6': nn.ReLU6, | |||
| 'SELU': nn.SELU, | |||
| 'ELU': nn.ELU, | |||
| 'GELU': nn.GELU, | |||
| 'PReLU': nn.PReLU, | |||
| 'SiLU': nn.SiLU, | |||
| 'HardSwish': nn.Hardswish, | |||
| 'Hardswish': nn.Hardswish, | |||
| None: nn.Identity, | |||
| } | |||
| def act_layers(name): | |||
| assert name in activations.keys() | |||
| if name == 'LeakyReLU': | |||
| return nn.LeakyReLU(negative_slope=0.1, inplace=True) | |||
| elif name == 'GELU': | |||
| return nn.GELU() | |||
| elif name == 'PReLU': | |||
| return nn.PReLU() | |||
| else: | |||
| return activations[name](inplace=True) | |||
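| # E.g. act_layers('LeakyReLU') returns nn.LeakyReLU(negative_slope=0.1, inplace=True), | |||
| # and act_layers(None) resolves to nn.Identity, which ignores the inplace kwarg. | |||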
| norm_cfg = { | |||
| 'BN': ('bn', nn.BatchNorm2d), | |||
| 'SyncBN': ('bn', nn.SyncBatchNorm), | |||
| 'GN': ('gn', nn.GroupNorm), | |||
| } | |||
| def build_norm_layer(cfg, num_features, postfix=''): | |||
| """Build normalization layer | |||
| Args: | |||
| cfg (dict): cfg should contain: | |||
| type (str): identify norm layer type. | |||
| layer args: args needed to instantiate a norm layer. | |||
| requires_grad (bool): [optional] whether stop gradient updates | |||
| num_features (int): number of channels from input. | |||
| postfix (int, str): appended into norm abbreviation to | |||
| create named layer. | |||
| Returns: | |||
| name (str): abbreviation + postfix | |||
| layer (nn.Module): created norm layer | |||
| """ | |||
| assert isinstance(cfg, dict) and 'type' in cfg | |||
| cfg_ = cfg.copy() | |||
| layer_type = cfg_.pop('type') | |||
| if layer_type not in norm_cfg: | |||
| raise KeyError('Unrecognized norm type {}'.format(layer_type)) | |||
| else: | |||
| abbr, norm_layer = norm_cfg[layer_type] | |||
| if norm_layer is None: | |||
| raise NotImplementedError | |||
| assert isinstance(postfix, (int, str)) | |||
| name = abbr + str(postfix) | |||
| requires_grad = cfg_.pop('requires_grad', True) | |||
| cfg_.setdefault('eps', 1e-5) | |||
| if layer_type != 'GN': | |||
| layer = norm_layer(num_features, **cfg_) | |||
| if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): | |||
| layer._specify_ddp_gpu_num(1) | |||
| else: | |||
| assert 'num_groups' in cfg_ | |||
| layer = norm_layer(num_channels=num_features, **cfg_) | |||
| for param in layer.parameters(): | |||
| param.requires_grad = requires_grad | |||
| return name, layer | |||
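| # Usage sketch: | |||
| # | |||
| #     name, layer = build_norm_layer(dict(type='BN'), 64) | |||
| #     # name == 'bn', layer is nn.BatchNorm2d(64, eps=1e-05) | |||
| #     name, layer = build_norm_layer(dict(type='GN', num_groups=8), 64) | |||
| #     # name == 'gn', layer is nn.GroupNorm(8, 64, eps=1e-05) | |||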
| class ConvModule(nn.Module): | |||
| """A conv block that contains conv/norm/activation layers. | |||
| Args: | |||
| in_channels (int): Same as nn.Conv2d. | |||
| out_channels (int): Same as nn.Conv2d. | |||
| kernel_size (int or tuple[int]): Same as nn.Conv2d. | |||
| stride (int or tuple[int]): Same as nn.Conv2d. | |||
| padding (int or tuple[int]): Same as nn.Conv2d. | |||
| dilation (int or tuple[int]): Same as nn.Conv2d. | |||
| groups (int): Same as nn.Conv2d. | |||
| bias (bool or str): If specified as `auto`, it will be decided by the | |||
| norm_cfg. Bias will be set as True if norm_cfg is None, otherwise | |||
| False. | |||
| conv_cfg (dict): Config dict for convolution layer. | |||
| norm_cfg (dict): Config dict for normalization layer. | |||
| activation (str): activation layer, "ReLU" by default. | |||
| inplace (bool): Whether to use inplace mode for activation. | |||
| order (tuple[str]): The order of conv/norm/activation layers. It is a | |||
| sequence of "conv", "norm" and "act". Examples are | |||
| ("conv", "norm", "act") and ("act", "conv", "norm"). | |||
| """ | |||
| def __init__( | |||
| self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| groups=1, | |||
| bias='auto', | |||
| conv_cfg=None, | |||
| norm_cfg=None, | |||
| activation='ReLU', | |||
| inplace=True, | |||
| order=('conv', 'norm', 'act'), | |||
| ): | |||
| super(ConvModule, self).__init__() | |||
| assert conv_cfg is None or isinstance(conv_cfg, dict) | |||
| assert norm_cfg is None or isinstance(norm_cfg, dict) | |||
| assert activation is None or isinstance(activation, str) | |||
| self.conv_cfg = conv_cfg | |||
| self.norm_cfg = norm_cfg | |||
| self.activation = activation | |||
| self.inplace = inplace | |||
| self.order = order | |||
| assert isinstance(self.order, tuple) and len(self.order) == 3 | |||
| assert set(order) == {'conv', 'norm', 'act'} | |||
| self.with_norm = norm_cfg is not None | |||
| if bias == 'auto': | |||
| bias = False if self.with_norm else True | |||
| self.with_bias = bias | |||
| if self.with_norm and self.with_bias: | |||
| warnings.warn('ConvModule has norm and bias at the same time') | |||
| self.conv = nn.Conv2d( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=stride, | |||
| padding=padding, | |||
| dilation=dilation, | |||
| groups=groups, | |||
| bias=bias, | |||
| ) | |||
| self.in_channels = self.conv.in_channels | |||
| self.out_channels = self.conv.out_channels | |||
| self.kernel_size = self.conv.kernel_size | |||
| self.stride = self.conv.stride | |||
| self.padding = self.conv.padding | |||
| self.dilation = self.conv.dilation | |||
| self.transposed = self.conv.transposed | |||
| self.output_padding = self.conv.output_padding | |||
| self.groups = self.conv.groups | |||
| if self.with_norm: | |||
| if order.index('norm') > order.index('conv'): | |||
| norm_channels = out_channels | |||
| else: | |||
| norm_channels = in_channels | |||
| self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) | |||
| self.add_module(self.norm_name, norm) | |||
| else: | |||
| self.norm_name = None | |||
| if self.activation: | |||
| self.act = act_layers(self.activation) | |||
| @property | |||
| def norm(self): | |||
| if self.norm_name: | |||
| return getattr(self, self.norm_name) | |||
| else: | |||
| return None | |||
| def forward(self, x, norm=True): | |||
| for layer in self.order: | |||
| if layer == 'conv': | |||
| x = self.conv(x) | |||
| elif layer == 'norm' and norm and self.with_norm: | |||
| x = self.norm(x) | |||
| elif layer == 'act' and self.activation: | |||
| x = self.act(x) | |||
| return x | |||
| class DepthwiseConvModule(nn.Module): | |||
| def __init__( | |||
| self, | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size, | |||
| stride=1, | |||
| padding=0, | |||
| dilation=1, | |||
| bias='auto', | |||
| norm_cfg=dict(type='BN'), | |||
| activation='ReLU', | |||
| inplace=True, | |||
| order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'), | |||
| ): | |||
| super(DepthwiseConvModule, self).__init__() | |||
| assert activation is None or isinstance(activation, str) | |||
| self.activation = activation | |||
| self.inplace = inplace | |||
| self.order = order | |||
| assert isinstance(self.order, tuple) and len(self.order) == 6 | |||
| assert set(order) == { | |||
| 'depthwise', | |||
| 'dwnorm', | |||
| 'act', | |||
| 'pointwise', | |||
| 'pwnorm', | |||
| 'act', | |||
| } | |||
| self.with_norm = norm_cfg is not None | |||
| if bias == 'auto': | |||
| bias = False if self.with_norm else True | |||
| self.with_bias = bias | |||
| if self.with_norm and self.with_bias: | |||
| warnings.warn('DepthwiseConvModule has norm and bias at the same time') | |||
| self.depthwise = nn.Conv2d( | |||
| in_channels, | |||
| in_channels, | |||
| kernel_size, | |||
| stride=stride, | |||
| padding=padding, | |||
| dilation=dilation, | |||
| groups=in_channels, | |||
| bias=bias, | |||
| ) | |||
| self.pointwise = nn.Conv2d( | |||
| in_channels, | |||
| out_channels, | |||
| kernel_size=1, | |||
| stride=1, | |||
| padding=0, | |||
| bias=bias) | |||
| self.in_channels = self.depthwise.in_channels | |||
| self.out_channels = self.pointwise.out_channels | |||
| self.kernel_size = self.depthwise.kernel_size | |||
| self.stride = self.depthwise.stride | |||
| self.padding = self.depthwise.padding | |||
| self.dilation = self.depthwise.dilation | |||
| self.transposed = self.depthwise.transposed | |||
| self.output_padding = self.depthwise.output_padding | |||
| if self.with_norm: | |||
| _, self.dwnorm = build_norm_layer(norm_cfg, in_channels) | |||
| _, self.pwnorm = build_norm_layer(norm_cfg, out_channels) | |||
| if self.activation: | |||
| self.act = act_layers(self.activation) | |||
| def forward(self, x, norm=True): | |||
| for layer_name in self.order: | |||
| if layer_name != 'act': | |||
| layer = self.__getattr__(layer_name) | |||
| x = layer(x) | |||
| elif layer_name == 'act' and self.activation: | |||
| x = self.act(x) | |||
| return x | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation here is modified based on DeOldify, originally MIT License | |||
| # and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation here is modified based on DeOldify, originally MIT License and | |||
| # publicly available at https://github.com/jantic/DeOldify/blob/master/fastai/callbacks/hooks.py | |||
| import functools | |||
| from enum import Enum | |||
| @@ -1,3 +1,5 @@ | |||
| # Part of the implementation is borrowed and modified from Face-Alignment, | |||
| # publicly available at https://github.com/foamliu/Face-Alignment/blob/master/align_faces.py | |||
| import cv2 | |||
| import numpy as np | |||
| from skimage import transform as trans | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| import cv2 | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from FaceQuality, made publicly available under the MIT License | |||
| # at https://github.com/deepcam-cn/FaceQuality/blob/master/models/model_resnet.py | |||
| import torch | |||
| from torch import nn | |||
| @@ -1,3 +1,5 @@ | |||
| # The GPEN implementation is also open-sourced by the authors, | |||
| # and available at https://github.com/yangxy/GPEN/blob/main/face_model/gpen_model.py | |||
| import functools | |||
| import itertools | |||
| import math | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import math | |||
| import os.path as osp | |||
| from copy import deepcopy | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from InsightFace_Pytorch, | |||
| # made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||
| from collections import namedtuple | |||
| import torch | |||
| @@ -1,3 +1,5 @@ | |||
| # The GPEN implementation is also open-sourced by the authors, | |||
| # and available at https://github.com/yangxy/GPEN/tree/main/training/loss/id_loss.py | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from InsightFace_Pytorch, | |||
| # made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py | |||
| from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, | |||
| Module, PReLU, Sequential) | |||
| @@ -1,3 +1,5 @@ | |||
| # The GPEN implementation is also open-sourced by the authors, | |||
| # and available at https://github.com/yangxy/GPEN/blob/main/face_detect/retinaface_detection.py | |||
| import os | |||
| import cv2 | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License | |||
| # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py | |||
| import time | |||
| import torch | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License | |||
| # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py | |||
| from collections import OrderedDict | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,5 @@ | |||
| # Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import random | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,5 @@ | |||
| # Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP. | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| # APPs that facilitate the use of pretrained neural networks. | |||
| import os.path as osp | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import os | |||
| import random | |||
| @@ -1,3 +1,6 @@ | |||
| # Part of the implementation is borrowed and modified from latent-diffusion, | |||
| # publicly available at https://github.com/CompVis/latent-diffusion. | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import math | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import numpy as np | |||
| import scipy.linalg as linalg | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import colorsys | |||
| import random | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import cv2 | |||
| import numpy as np | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| r"""SVD of linear degradation matrices described in the paper | |||
| ``Denoising Diffusion Restoration Models.'' | |||
| @article{kawar2022denoising, | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import base64 | |||
| import binascii | |||
| import hashlib | |||
| @@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) | |||
| ]) | |||
| self.infer_result = {'vid': [], 'sid': [], 'pred': []} | |||
| sampling_method = self.cfg.dataset.sampling_method.name | |||
| self.neighbor_size = self.cfg.dataset.sampling_method.params[ | |||
| sampling_method].neighbor_size | |||
| @@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| shot_num = len(sids) | |||
| cnt = shot_num // bs + 1 | |||
| infer_sid, infer_pred = [], [] | |||
| infer_result = {} | |||
| for i in range(cnt): | |||
| start = i * bs | |||
| end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num | |||
| @@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| input_ = torch.stack(input_) | |||
| outputs = self.shared_step(input_) # shape [b,2] | |||
| prob = F.softmax(outputs, dim=1) | |||
| self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) | |||
| self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) | |||
| self.infer_result['pred'] = np.stack(self.infer_result['pred']) | |||
| infer_sid.extend(sid_.cpu().detach().numpy()) | |||
| infer_pred.extend(prob[:, 1].cpu().detach().numpy()) | |||
| infer_result.update({'pred': np.stack(infer_pred)}) | |||
| infer_result.update({'sid': infer_sid}) | |||
| assert len(self.infer_result['sid']) == len(sids) | |||
| assert len(self.infer_result['pred']) == len(inputs) | |||
| return self.infer_result | |||
| assert len(infer_result['sid']) == len(sids) | |||
| assert len(infer_result['pred']) == len(inputs) | |||
| return infer_result | |||
| def shared_step(self, inputs): | |||
| with torch.no_grad(): | |||
| @@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel): | |||
| thres = self.cfg.pipeline.save_threshold | |||
| anno_dict = get_pred_boundary(pred_dict, thres) | |||
| scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) | |||
| scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( | |||
| self.shot2keyf, anno_dict) | |||
| if self.cfg.pipeline.save_split_scene: | |||
| re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) | |||
| print(f'Split scene video saved to {re_dir}') | |||
| return len(scene_list), scene_dict_lst | |||
| return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst | |||
| def preprocess(self, inputs): | |||
| logger.info('Begin shot detect......') | |||
| @@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict): | |||
| scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) | |||
| scene_dict_lst = [] | |||
| shot_num = len(shot2keyf) | |||
| shot_dict_lst = [] | |||
| for item in shot2keyf: | |||
| tmp = item.split(' ') | |||
| shot_dict_lst.append({ | |||
| 'frame': [tmp[0], tmp[1]], | |||
| 'timestamps': [tmp[-2], tmp[-1]] | |||
| }) | |||
| assert len(scene_list) == len(pair_list) | |||
| for scene_ind, scene_item in enumerate(scene_list): | |||
| scene_dict_lst.append({ | |||
| 'shot': pair_list[scene_ind], | |||
| 'frame': scene_item[0], | |||
| 'timestamp': scene_item[1] | |||
| 'timestamps': scene_item[1] | |||
| }) | |||
| return scene_dict_lst, scene_list | |||
| return scene_dict_lst, scene_list, shot_num, shot_dict_lst | |||
| def scene2video(source_movie_fn, scene_list, thres): | |||
| @@ -0,0 +1,20 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| from typing import TYPE_CHECKING | |||
| from modelscope.utils.import_utils import LazyImportModule | |||
| if TYPE_CHECKING: | |||
| from .seg_infer import F3NetProductSegmentation | |||
| else: | |||
| _import_structure = {'seg_infer': ['F3NetProductSegmentation']} | |||
| import sys | |||
| sys.modules[__name__] = LazyImportModule( | |||
| __name__, | |||
| globals()['__file__'], | |||
| _import_structure, | |||
| module_spec=__spec__, | |||
| extra_objects={}, | |||
| ) | |||
| @@ -0,0 +1,197 @@ | |||
| # The implementation here is modified based on F3Net, | |||
| # originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| class Bottleneck(nn.Module): | |||
| def __init__(self, | |||
| inplanes, | |||
| planes, | |||
| stride=1, | |||
| downsample=None, | |||
| dilation=1): | |||
| super(Bottleneck, self).__init__() | |||
| self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) | |||
| self.bn1 = nn.BatchNorm2d(planes) | |||
| self.conv2 = nn.Conv2d( | |||
| planes, | |||
| planes, | |||
| kernel_size=3, | |||
| stride=stride, | |||
| padding=(3 * dilation - 1) // 2, | |||
| bias=False, | |||
| dilation=dilation) | |||
| self.bn2 = nn.BatchNorm2d(planes) | |||
| self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) | |||
| self.bn3 = nn.BatchNorm2d(planes * 4) | |||
| self.downsample = downsample | |||
| def forward(self, x): | |||
| out = F.relu(self.bn1(self.conv1(x)), inplace=True) | |||
| out = F.relu(self.bn2(self.conv2(out)), inplace=True) | |||
| out = self.bn3(self.conv3(out)) | |||
| if self.downsample is not None: | |||
| x = self.downsample(x) | |||
| return F.relu(out + x, inplace=True) | |||
| class ResNet(nn.Module): | |||
| def __init__(self): | |||
| super(ResNet, self).__init__() | |||
| self.inplanes = 64 | |||
| self.conv1 = nn.Conv2d( | |||
| 3, 64, kernel_size=7, stride=2, padding=3, bias=False) | |||
| self.bn1 = nn.BatchNorm2d(64) | |||
| self.layer1 = self.make_layer(64, 3, stride=1, dilation=1) | |||
| self.layer2 = self.make_layer(128, 4, stride=2, dilation=1) | |||
| self.layer3 = self.make_layer(256, 6, stride=2, dilation=1) | |||
| self.layer4 = self.make_layer(512, 3, stride=2, dilation=1) | |||
| def make_layer(self, planes, blocks, stride, dilation): | |||
| downsample = nn.Sequential( | |||
| nn.Conv2d( | |||
| self.inplanes, | |||
| planes * 4, | |||
| kernel_size=1, | |||
| stride=stride, | |||
| bias=False), nn.BatchNorm2d(planes * 4)) | |||
| layers = [ | |||
| Bottleneck( | |||
| self.inplanes, planes, stride, downsample, dilation=dilation) | |||
| ] | |||
| self.inplanes = planes * 4 | |||
| for _ in range(1, blocks): | |||
| layers.append(Bottleneck(self.inplanes, planes, dilation=dilation)) | |||
| return nn.Sequential(*layers) | |||
| def forward(self, x): | |||
| # the segmentation pipeline feeds a single 448x448 image; enforce that shape | |||
| x = x.reshape(1, 3, 448, 448) | |||
| out1 = F.relu(self.bn1(self.conv1(x)), inplace=True) | |||
| out1 = F.max_pool2d(out1, kernel_size=3, stride=2, padding=1) | |||
| out2 = self.layer1(out1) | |||
| out3 = self.layer2(out2) | |||
| out4 = self.layer3(out3) | |||
| out5 = self.layer4(out4) | |||
| return out2, out3, out4, out5 | |||
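Since `forward` hard-codes a single 1x3x448x448 input, the backbone strides are easy to sanity-check. A minimal sketch using only the classes defined in this hunk:

```python
import torch

net = ResNet().eval()
with torch.no_grad():
    out2, out3, out4, out5 = net(torch.randn(1, 3, 448, 448))
# conv1 (stride 2) plus the 3x3 max-pool (stride 2) bring 448 -> 112 before layer1.
assert out2.shape == (1, 256, 112, 112)
assert out3.shape == (1, 512, 56, 56)
assert out4.shape == (1, 1024, 28, 28)
assert out5.shape == (1, 2048, 14, 14)
```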
| class CFM(nn.Module): | |||
| def __init__(self): | |||
| super(CFM, self).__init__() | |||
| self.conv1h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn1h = nn.BatchNorm2d(64) | |||
| self.conv2h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn2h = nn.BatchNorm2d(64) | |||
| self.conv3h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn3h = nn.BatchNorm2d(64) | |||
| self.conv4h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn4h = nn.BatchNorm2d(64) | |||
| self.conv1v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn1v = nn.BatchNorm2d(64) | |||
| self.conv2v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn2v = nn.BatchNorm2d(64) | |||
| self.conv3v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn3v = nn.BatchNorm2d(64) | |||
| self.conv4v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1) | |||
| self.bn4v = nn.BatchNorm2d(64) | |||
| def forward(self, left, down): | |||
| # Upsample the coarser 'down' feature so both branches share a spatial size. | |||
| if down.size()[2:] != left.size()[2:]: | |||
| down = F.interpolate(down, size=left.size()[2:], mode='bilinear') | |||
| out1h = F.relu(self.bn1h(self.conv1h(left)), inplace=True) | |||
| out2h = F.relu(self.bn2h(self.conv2h(out1h)), inplace=True) | |||
| out1v = F.relu(self.bn1v(self.conv1v(down)), inplace=True) | |||
| out2v = F.relu(self.bn2v(self.conv2v(out1v)), inplace=True) | |||
| # Element-wise product fuses the branches; each side then refines residually. | |||
| fuse = out2h * out2v | |||
| out3h = F.relu(self.bn3h(self.conv3h(fuse)), inplace=True) + out1h | |||
| out4h = F.relu(self.bn4h(self.conv4h(out3h)), inplace=True) | |||
| out3v = F.relu(self.bn3v(self.conv3v(fuse)), inplace=True) + out1v | |||
| out4v = F.relu(self.bn4v(self.conv4v(out3v)), inplace=True) | |||
| return out4h, out4v | |||
| class Decoder(nn.Module): | |||
| def __init__(self): | |||
| super(Decoder, self).__init__() | |||
| self.cfm45 = CFM() | |||
| self.cfm34 = CFM() | |||
| self.cfm23 = CFM() | |||
| def forward(self, out2h, out3h, out4h, out5v, fback=None): | |||
| if fback is not None: | |||
| refine5 = F.interpolate( | |||
| fback, size=out5v.size()[2:], mode='bilinear') | |||
| refine4 = F.interpolate( | |||
| fback, size=out4h.size()[2:], mode='bilinear') | |||
| refine3 = F.interpolate( | |||
| fback, size=out3h.size()[2:], mode='bilinear') | |||
| refine2 = F.interpolate( | |||
| fback, size=out2h.size()[2:], mode='bilinear') | |||
| out5v = out5v + refine5 | |||
| out4h, out4v = self.cfm45(out4h + refine4, out5v) | |||
| out3h, out3v = self.cfm34(out3h + refine3, out4v) | |||
| out2h, pred = self.cfm23(out2h + refine2, out3v) | |||
| else: | |||
| out4h, out4v = self.cfm45(out4h, out5v) | |||
| out3h, out3v = self.cfm34(out3h, out4v) | |||
| out2h, pred = self.cfm23(out2h, out3v) | |||
| return out2h, out3h, out4h, out5v, pred | |||
| class F3Net(nn.Module): | |||
| def __init__(self): | |||
| super(F3Net, self).__init__() | |||
| self.bkbone = ResNet() | |||
| self.squeeze5 = nn.Sequential( | |||
| nn.Conv2d(2048, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)) | |||
| self.squeeze4 = nn.Sequential( | |||
| nn.Conv2d(1024, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)) | |||
| self.squeeze3 = nn.Sequential( | |||
| nn.Conv2d(512, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)) | |||
| self.squeeze2 = nn.Sequential( | |||
| nn.Conv2d(256, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True)) | |||
| self.decoder1 = Decoder() | |||
| self.decoder2 = Decoder() | |||
| self.linearp1 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| self.linearp2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| self.linearr2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| self.linearr3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| self.linearr4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| self.linearr5 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1) | |||
| def forward(self, x, shape=None): | |||
| x = x.reshape(1, 3, 448, 448) | |||
| out2h, out3h, out4h, out5v = self.bkbone(x) | |||
| out2h, out3h, out4h, out5v = self.squeeze2(out2h), self.squeeze3( | |||
| out3h), self.squeeze4(out4h), self.squeeze5(out5v) | |||
| out2h, out3h, out4h, out5v, pred1 = self.decoder1( | |||
| out2h, out3h, out4h, out5v) | |||
| out2h, out3h, out4h, out5v, pred2 = self.decoder2( | |||
| out2h, out3h, out4h, out5v, pred1) | |||
| shape = x.size()[2:] if shape is None else shape | |||
| pred1 = F.interpolate( | |||
| self.linearp1(pred1), size=shape, mode='bilinear') | |||
| pred2 = F.interpolate( | |||
| self.linearp2(pred2), size=shape, mode='bilinear') | |||
| out2h = F.interpolate( | |||
| self.linearr2(out2h), size=shape, mode='bilinear') | |||
| out3h = F.interpolate( | |||
| self.linearr3(out3h), size=shape, mode='bilinear') | |||
| out4h = F.interpolate( | |||
| self.linearr4(out4h), size=shape, mode='bilinear') | |||
| out5h = F.interpolate( | |||
| self.linearr5(out5v), size=shape, mode='bilinear') | |||
| return pred1, pred2, out2h, out3h, out4h, out5h | |||
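A smoke-test sketch for the assembled network (random weights, nothing assumed beyond this hunk): the second decoder refines the first decoder's prediction via the `fback` path, and every head is upsampled back to the hard-coded 448x448 input resolution.

```python
import torch

net = F3Net().eval()
with torch.no_grad():
    pred1, pred2, out2h, out3h, out4h, out5h = net(torch.randn(1, 3, 448, 448))
# All six maps are single-channel saliency logits at the input resolution.
for t in (pred1, pred2, out2h, out3h, out4h, out5h):
    assert t.shape == (1, 1, 448, 448)
```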
| @@ -0,0 +1,77 @@ | |||
| # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. | |||
| import cv2 | |||
| import numpy as np | |||
| import torch | |||
| from PIL import Image | |||
| from modelscope.metainfo import Models | |||
| from modelscope.models.base import TorchModel | |||
| from modelscope.models.builder import MODELS | |||
| from modelscope.utils.constant import ModelFile, Tasks | |||
| from modelscope.utils.logger import get_logger | |||
| from .net import F3Net | |||
| logger = get_logger() | |||
| def load_state_dict(model_dir, device): | |||
| _dict = torch.load( | |||
| '{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), | |||
| map_location=device) | |||
| state_dict = {} | |||
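| # Strip the 'module.' prefix that nn.DataParallel prepends to parameter names. | |||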
| for k, v in _dict.items(): | |||
| if k.startswith('module'): | |||
| k = k[7:] | |||
| state_dict[k] = v | |||
| return state_dict | |||
| @MODELS.register_module( | |||
| Tasks.product_segmentation, module_name=Models.product_segmentation) | |||
| class F3NetForProductSegmentation(TorchModel): | |||
| def __init__(self, model_dir, device_id=0, *args, **kwargs): | |||
| super().__init__( | |||
| model_dir=model_dir, device_id=device_id, *args, **kwargs) | |||
| self.model = F3Net() | |||
| if torch.cuda.is_available(): | |||
| self.device = 'cuda' | |||
| logger.info('Use GPU') | |||
| else: | |||
| self.device = 'cpu' | |||
| logger.info('Use CPU') | |||
| self.params = load_state_dict(model_dir, self.device) | |||
| self.model.load_state_dict(self.params) | |||
| self.model.to(self.device) | |||
| self.model.eval() | |||
| def forward(self, x): | |||
| pred_result = self.model(x) | |||
| return pred_result | |||
| mean = np.array([[[124.55, 118.90, 102.94]]]) | |||
| std = np.array([[[56.77, 55.97, 57.50]]]) | |||
| def inference(model, device, input_path): | |||
| img = Image.open(input_path) | |||
| img = np.array(img.convert('RGB')).astype(np.float32) | |||
| img = (img - mean) / std | |||
| img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR) | |||
| img = torch.from_numpy(img) | |||
| img = img.permute(2, 0, 1) | |||
| img = img.to(device).float() | |||
| outputs = model(img) | |||
| out = outputs[0] | |||
| pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy() | |||
| pred[pred < 20] = 0 | |||
| pred = pred[:, :, np.newaxis] | |||
| pred = np.round(pred) | |||
| logger.info('Inference Done') | |||
| return pred | |||
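A hypothetical end-to-end sketch of this file's helpers; the model directory and image path are placeholders, and `torch`/`cv2` come from the imports at the top of the hunk:

```python
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = F3NetForProductSegmentation('/path/to/model_dir')  # placeholder dir holding the .bin weights
mask = inference(model, device, 'product.jpg')             # (448, 448, 1) float array in [0, 255]
cv2.imwrite('mask.png', mask.astype('uint8'))
```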
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.nn.functional as F | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import warnings | |||
| import torch | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import time | |||
| from typing import Dict, List, Optional, Tuple, Union | |||
| @@ -1,3 +1,4 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License | |||
| # at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/arch_util.py | |||
| import collections.abc | |||
| import math | |||
| import warnings | |||
| @@ -1,3 +1,5 @@ | |||
| # The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License | |||
| # at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/rrdbnet_arch.py | |||
| import torch | |||
| from torch import nn as nn | |||
| from torch.nn import functional as F | |||
| @@ -1 +1,3 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .model import CLIPForMultiModalEmbedding | |||
| @@ -1,3 +1,18 @@ | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import os | |||
| from collections import OrderedDict | |||
| from typing import Any, Dict, Iterable, List, Tuple, Union | |||
| @@ -543,6 +543,7 @@ class GEMMModel(nn.Module): | |||
| img_feature, text_feature, caption = None, None, None | |||
| if captioning and image is not None: | |||
| img_feature, caption = self.model.image_to_text(image) | |||
| img_feature = self.parse_feat(img_feature) | |||
| elif image is not None: | |||
| img_feature = self.parse_feat(self.model.encode_image(image)) | |||
| if text is not None: | |||
| @@ -67,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel): | |||
| return img_tensor | |||
| def parse_text(self, text_str): | |||
| if text_str is None: | |||
| if text_str is None or len(text_str) == 0: | |||
| return None | |||
| if isinstance(text_str, str): | |||
| text_ids_tensor = self.gemm_model.tokenize(text_str) | |||
| @@ -79,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel): | |||
| return text_ids_tensor.view(1, -1) | |||
| def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: | |||
| image = self.parse_image(input.get('image', input.get('img', None))) | |||
| text = self.parse_text(input.get('text', input.get('txt', None))) | |||
| captioning = input.get('captioning', False) is True | |||
| image_input = input.get('image', input.get('img', None)) | |||
| text_input = input.get('text', input.get('txt', None)) | |||
| captioning_input = input.get('captioning', None) | |||
| image = self.parse_image(image_input) | |||
| text = self.parse_text(text_input) | |||
| captioning = captioning_input is True or text_input == '' | |||
| out = self.gemm_model(image, text, captioning) | |||
| output = { | |||
| OutputKeys.IMG_EMBEDDING: out.get('image_feature', None), | |||
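The rewritten `forward` makes the captioning trigger explicit: it fires when the caller passes `captioning=True` or when the text field is the empty string (and `parse_text` now maps `''` to `None`, so text encoding is still skipped). Hypothetical calls against an instantiated `GEMMForMultiModalEmbedding` named `model`:

```python
out = model({'image': pil_image, 'text': 'a red dress'})  # joint image/text embedding
out = model({'image': pil_image, 'text': ''})             # empty text now implies captioning
out = model({'img': pil_image, 'captioning': True})       # explicit flag; 'img'/'txt' aliases work
```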
| @@ -1,4 +1,4 @@ | |||
| # The implementation is adopated from the CLIP4Clip implementation, | |||
| # The implementation is adopted from the CLIP4Clip implementation, | |||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | |||
| import random | |||
| @@ -1,4 +1,4 @@ | |||
| # The implementation is adopated from the CLIP4Clip implementation, | |||
| # The implementation is adopted from the CLIP4Clip implementation, | |||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | |||
| import numpy as np | |||
| @@ -1,4 +1,4 @@ | |||
| # The implementation is adopated from the CLIP4Clip implementation, | |||
| # The implementation is adopted from the CLIP4Clip implementation, | |||
| # made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip | |||
| import gzip | |||
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel | |||
| from .tokenization_ofa import OFATokenizer, OFATokenizerZH | |||
| from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast | |||
| @@ -1,3 +1,17 @@ | |||
| # Copyright 2022 OFA-Sys Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import torch | |||
| import torch.nn as nn | |||
| @@ -1 +1,2 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| from .constant import OFA_TASK_KEY_MAPPING | |||
| @@ -1,3 +1,5 @@ | |||
| # Copyright (c) Alibaba, Inc. and its affiliates. | |||
| import os | |||
| from typing import Any, Dict | |||
| @@ -21,6 +21,7 @@ class OutputKeys(object): | |||
| POLYGONS = 'polygons' | |||
| OUTPUT = 'output' | |||
| OUTPUT_IMG = 'output_img' | |||
| OUTPUT_VIDEO = 'output_video' | |||
| OUTPUT_PCM = 'output_pcm' | |||
| IMG_EMBEDDING = 'img_embedding' | |||
| SPO_LIST = 'spo_list' | |||
| @@ -37,8 +38,10 @@ class OutputKeys(object): | |||
| KWS_LIST = 'kws_list' | |||
| HISTORY = 'history' | |||
| TIMESTAMPS = 'timestamps' | |||
| SPLIT_VIDEO_NUM = 'split_video_num' | |||
| SPLIT_META_LIST = 'split_meta_list' | |||
| SHOT_NUM = 'shot_num' | |||
| SCENE_NUM = 'scene_num' | |||
| SCENE_META_LIST = 'scene_meta_list' | |||
| SHOT_META_LIST = 'shot_meta_list' | |||
| TASK_OUTPUTS = { | |||
| @@ -218,13 +221,21 @@ TASK_OUTPUTS = { | |||
| # 3D human body keypoints detection result for single sample | |||
| # { | |||
| # "poses": [ | |||
| # [[x, y, z]*17], | |||
| # [[x, y, z]*17], | |||
| # [[x, y, z]*17] | |||
| # ] | |||
| # "poses": [ # 3d pose coordinate in camera coordinate | |||
| # [[x, y, z]*17], # joints of per image | |||
| # [[x, y, z]*17], | |||
| # ... | |||
| # ], | |||
| # "timestamps": [ # timestamps of all frames | |||
| # "00:00:0.230", | |||
| # "00:00:0.560", | |||
| # "00:00:0.690", | |||
| # ], | |||
| # "output_video": "path_to_rendered_video" , this is optional | |||
| # and is only avaialbe when the "render" option is enabled. | |||
| # } | |||
| Tasks.body_3d_keypoints: [OutputKeys.POSES], | |||
| Tasks.body_3d_keypoints: | |||
| [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], | |||
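A hypothetical consumption sketch for the extended schema (`p3d` stands in for a body-3d-keypoints pipeline instance):

```python
result = p3d('dance.mp4')
poses = result['poses']              # [num_frames][17][3] in camera coordinates
stamps = result['timestamps']        # one timestamp string per frame
video = result.get('output_video')   # only present when rendering is enabled
```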
| # 2D hand keypoints result for single sample | |||
| # { | |||
| @@ -300,19 +311,30 @@ TASK_OUTPUTS = { | |||
| Tasks.shop_segmentation: [OutputKeys.MASKS], | |||
| # movie scene segmentation result for a single video | |||
| # { | |||
| # "split_video_num":3, | |||
| # "split_meta_list": | |||
| # "shot_num":15, | |||
| # "shot_meta_list": | |||
| # [ | |||
| # { | |||
| # "frame": [start_frame, end_frame], | |||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # | |||
| # } | |||
| # ] | |||
| # "scene_num":3, | |||
| # "scene_meta_list": | |||
| # [ | |||
| # { | |||
| # "shot": [0,1,2], | |||
| # "frame": [start_frame, end_frame], | |||
| # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] | |||
| # } | |||
| # ] | |||
| # | |||
| # } | |||
| Tasks.movie_scene_segmentation: | |||
| [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], | |||
| Tasks.movie_scene_segmentation: [ | |||
| OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM, | |||
| OutputKeys.SCENE_META_LIST | |||
| ], | |||
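A hypothetical sketch of reading the renamed fields (`seg` stands in for a movie-scene-segmentation pipeline instance):

```python
result = seg('movie.mp4')
print(result['shot_num'], result['scene_num'])
scene = result['scene_meta_list'][0]
print(scene['shot'], scene['frame'], scene['timestamps'])
```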
| # ============ nlp tasks =================== | |||
| @@ -649,8 +671,28 @@ TASK_OUTPUTS = { | |||
| # 'output': ['Done' / 'Decode_Error'] | |||
| # } | |||
| Tasks.video_inpainting: [OutputKeys.OUTPUT], | |||
| # { | |||
| # 'output': ['bixin'] | |||
| # } | |||
| Tasks.hand_static: [OutputKeys.OUTPUT] | |||
| Tasks.hand_static: [OutputKeys.OUTPUT], | |||
| # { | |||
| # 'output': [ | |||
| # [2, 75, 287, 240, 510, 0.8335018754005432], | |||
| # [1, 127, 83, 332, 366, 0.9175254702568054], | |||
| # [0, 0, 0, 367, 639, 0.9693422317504883]] | |||
| # } | |||
| Tasks.face_human_hand_detection: [OutputKeys.OUTPUT], | |||
| # { | |||
| # 'output': 'Happiness', 'boxes': (203, 104, 663, 564) | |||
| # } | |||
| Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES], | |||
| # { | |||
| # "masks": [ | |||
| # np.array # 2D array containing only 0, 255 | |||
| # ] | |||
| # } | |||
| Tasks.product_segmentation: [OutputKeys.MASKS], | |||
| } | |||
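Finally, a hedged sketch of exercising the new product-segmentation task end to end through the standard pipeline API; the model id is a placeholder, while the task key is the one registered earlier in this PR:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

seg = pipeline(Tasks.product_segmentation, model='<product-segmentation-model-id>')
masks = seg('product.jpg')['masks']   # 2D arrays containing only 0 and 255
```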