
Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/models/multi_modal/ofa/utils/__init__.py
branch: master
author: 行嗔, 3 years ago
commit: fcbcd8e1b6
100 changed files with 4495 additions and 42 deletions
  1. +3 -0  data/test/images/face_emotion.jpg
  2. +3 -0  data/test/images/face_human_hand_detection.jpg
  3. +3 -0  data/test/images/product_segmentation.jpg
  4. +10 -0  modelscope/metainfo.py
  5. +2 -0  modelscope/metrics/image_portrait_enhancement_metric.py
  6. +2 -0  modelscope/models/cv/action_recognition/__init__.py
  7. +1198 -0  modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
  8. +2 -0  modelscope/models/cv/body_2d_keypoints/hrnet_v2.py
  9. +2 -0  modelscope/models/cv/body_2d_keypoints/w48.py
  10. +2 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py
  11. +1 -1  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py
  12. +2 -0  modelscope/models/cv/cartoon/facelib/LK/lk.py
  13. +2 -0  modelscope/models/cv/cartoon/facelib/config.py
  14. +2 -0  modelscope/models/cv/cartoon/facelib/face_detector.py
  15. +2 -0  modelscope/models/cv/cartoon/facelib/face_landmark.py
  16. +2 -0  modelscope/models/cv/cartoon/facelib/facer.py
  17. +2 -4  modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py
  18. +1 -5  modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py
  19. +2 -0  modelscope/models/cv/cartoon/utils.py
  20. +1 -0  modelscope/models/cv/face_detection/mogface/__init__.py
  21. +1 -0  modelscope/models/cv/face_detection/mtcnn/__init__.py
  22. +1 -0  modelscope/models/cv/face_detection/retinaface/__init__.py
  23. +1 -0  modelscope/models/cv/face_detection/ulfd_slim/__init__.py
  24. +20 -0  modelscope/models/cv/face_emotion/__init__.py
  25. +6 -0  modelscope/models/cv/face_emotion/efficient/__init__.py
  26. +380 -0  modelscope/models/cv/face_emotion/efficient/model.py
  27. +559 -0  modelscope/models/cv/face_emotion/efficient/utils.py
  28. +67 -0  modelscope/models/cv/face_emotion/emotion_infer.py
  29. +96 -0  modelscope/models/cv/face_emotion/emotion_model.py
  30. +0 -0  modelscope/models/cv/face_emotion/face_alignment/__init__.py
  31. +79 -0  modelscope/models/cv/face_emotion/face_alignment/face.py
  32. +59 -0  modelscope/models/cv/face_emotion/face_alignment/face_align.py
  33. +2 -0  modelscope/models/cv/face_generation/op/conv2d_gradfix.py
  34. +2 -0  modelscope/models/cv/face_generation/op/fused_act.py
  35. +2 -0  modelscope/models/cv/face_generation/op/upfirdn2d.py
  36. +2 -0  modelscope/models/cv/face_generation/stylegan2.py
  37. +20 -0  modelscope/models/cv/face_human_hand_detection/__init__.py
  38. +133 -0  modelscope/models/cv/face_human_hand_detection/det_infer.py
  39. +395 -0  modelscope/models/cv/face_human_hand_detection/ghost_pan.py
  40. +427 -0  modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py
  41. +64 -0  modelscope/models/cv/face_human_hand_detection/one_stage_detector.py
  42. +182 -0  modelscope/models/cv/face_human_hand_detection/shufflenetv2.py
  43. +277 -0  modelscope/models/cv/face_human_hand_detection/utils.py
  44. +2 -0  modelscope/models/cv/image_colorization/unet.py
  45. +2 -0  modelscope/models/cv/image_colorization/utils.py
  46. +2 -0  modelscope/models/cv/image_portrait_enhancement/align_faces.py
  47. +1 -0  modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py
  48. +2 -0  modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py
  49. +2 -0  modelscope/models/cv/image_portrait_enhancement/gpen.py
  50. +1 -0  modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py
  51. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/helpers.py
  52. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/losses.py
  53. +2 -0  modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py
  54. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py
  55. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py
  56. +2 -0  modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py
  57. +1 -0  modelscope/models/cv/image_to_image_generation/model.py
  58. +1 -0  modelscope/models/cv/image_to_image_generation/models/autoencoder.py
  59. +2 -0  modelscope/models/cv/image_to_image_generation/models/clip.py
  60. +1 -0  modelscope/models/cv/image_to_image_generation/ops/diffusion.py
  61. +1 -0  modelscope/models/cv/image_to_image_generation/ops/losses.py
  62. +1 -0  modelscope/models/cv/image_to_image_translation/data/transforms.py
  63. +1 -0  modelscope/models/cv/image_to_image_translation/model_translation.py
  64. +1 -0  modelscope/models/cv/image_to_image_translation/models/autoencoder.py
  65. +2 -0  modelscope/models/cv/image_to_image_translation/models/clip.py
  66. +1 -0  modelscope/models/cv/image_to_image_translation/ops/apps.py
  67. +1 -0  modelscope/models/cv/image_to_image_translation/ops/degradation.py
  68. +3 -0  modelscope/models/cv/image_to_image_translation/ops/diffusion.py
  69. +1 -0  modelscope/models/cv/image_to_image_translation/ops/losses.py
  70. +1 -0  modelscope/models/cv/image_to_image_translation/ops/metrics.py
  71. +1 -0  modelscope/models/cv/image_to_image_translation/ops/random_color.py
  72. +1 -0  modelscope/models/cv/image_to_image_translation/ops/random_mask.py
  73. +1 -0  modelscope/models/cv/image_to_image_translation/ops/svd.py
  74. +1 -0  modelscope/models/cv/image_to_image_translation/ops/utils.py
  75. +12 -9  modelscope/models/cv/movie_scene_segmentation/model.py
  76. +10 -2  modelscope/models/cv/movie_scene_segmentation/utils/save_op.py
  77. +20 -0  modelscope/models/cv/product_segmentation/__init__.py
  78. +197 -0  modelscope/models/cv/product_segmentation/net.py
  79. +77 -0  modelscope/models/cv/product_segmentation/seg_infer.py
  80. +1 -0  modelscope/models/cv/skin_retouching/detection_model/detection_module.py
  81. +1 -0  modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py
  82. +1 -0  modelscope/models/cv/skin_retouching/inpainting_model/gconv.py
  83. +1 -0  modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py
  84. +1 -0  modelscope/models/cv/skin_retouching/unet_deploy.py
  85. +1 -0  modelscope/models/cv/skin_retouching/utils.py
  86. +1 -0  modelscope/models/cv/skin_retouching/weights_init.py
  87. +2 -0  modelscope/models/cv/super_resolution/arch_util.py
  88. +2 -0  modelscope/models/cv/super_resolution/rrdbnet_arch.py
  89. +2 -0  modelscope/models/multi_modal/clip/__init__.py
  90. +15 -0  modelscope/models/multi_modal/clip/model.py
  91. +1 -0  modelscope/models/multi_modal/gemm/gemm_base.py
  92. +7 -4  modelscope/models/multi_modal/gemm/gemm_model.py
  93. +1 -1  modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
  94. +1 -1  modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py
  95. +1 -1  modelscope/models/multi_modal/mmr/models/tokenization_clip.py
  96. +2 -0  modelscope/models/multi_modal/ofa/__init__.py
  97. +14 -0  modelscope/models/multi_modal/ofa/resnet.py
  98. +1 -0  modelscope/models/multi_modal/ofa/utils/__init__.py
  99. +2 -0  modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py
  100. +56 -14  modelscope/outputs.py

+3 -0  data/test/images/face_emotion.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:712b5525e37080d33f62d6657609dbef20e843ccc04ee5c788ea11aa7c08545e
size 123341

+3 -0  data/test/images/face_human_hand_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8fddc7be8381eb244cd692601f1c1e6cf3484b44bb4e73df0bc7de29352eb487
size 23889

+3 -0  data/test/images/product_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a16038f7809127eb3e03cbae049592d193707e095309daca78f7d108d67fe4ec
size 108357

+10 -0  modelscope/metainfo.py

@@ -40,6 +40,9 @@ class Models(object):
ulfd = 'ulfd'
video_inpainting = 'video-inpainting'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'

# EasyCV models
yolox = 'YOLOX'
@@ -179,9 +182,16 @@ class Pipelines(object):
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
shop_segmentation = 'shop-segmentation'
video_inpainting = 'video-inpainting'
pst_action_recognition = 'patchshift-action-recognition'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'

# nlp tasks
automatic_post_editing = 'automatic-post-editing'
translation_quality_estimation = 'translation-quality-estimation'
domain_classification = 'domain-classification'
sentence_similarity = 'sentence-similarity'
word_segmentation = 'word-segmentation'
part_of_speech = 'part-of-speech'
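
The identifiers added above tie the new model and pipeline registrations to tasks of the same name. As a minimal sketch of how such a registration is typically exercised, assuming a 'face-emotion' pipeline class and a default model are registered for that task (an assumption, not confirmed by this diff):

from modelscope.pipelines import pipeline

# assumed task name; the default model is resolved by the registry if one is configured
face_emotion = pipeline('face-emotion')
print(face_emotion('data/test/images/face_emotion.jpg'))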


+2 -0  modelscope/metrics/image_portrait_enhancement_metric.py

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from BasicSR, publicly available at
# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py
from typing import Dict

import numpy as np


+2 -0  modelscope/models/cv/action_recognition/__init__.py

@@ -7,11 +7,13 @@ if TYPE_CHECKING:

from .models import BaseVideoModel
from .tada_convnext import TadaConvNeXt
from .temporal_patch_shift_transformer import PatchShiftTransformer

else:
_import_structure = {
'models': ['BaseVideoModel'],
'tada_convnext': ['TadaConvNeXt'],
'temporal_patch_shift_transformer': ['PatchShiftTransformer']
}

import sys


+1198 -0  modelscope/models/cv/action_recognition/temporal_patch_shift_transformer.py
File diff suppressed because it is too large


+2 -0  modelscope/models/cv/body_2d_keypoints/hrnet_v2.py

@@ -1,3 +1,5 @@
# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.

import os

import numpy as np


+2 -0  modelscope/models/cv/body_2d_keypoints/w48.py

@@ -1,3 +1,5 @@
# The implementation is based on HRNET, available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation.

cfg_128x128_15 = {
'DATASET': {
'TYPE': 'DAMO',


+2 -0  modelscope/models/cv/body_3d_keypoints/body_3d_pose.py

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import logging
import os.path as osp
from typing import Any, Dict, List, Union


+1 -1  modelscope/models/cv/body_3d_keypoints/canonical_pose_modules.py

@@ -1,4 +1,4 @@
- # The implementation is based on OSTrack, available at https://github.com/facebookresearch/VideoPose3D
+ # The implementation is based on VideoPose3D, available at https://github.com/facebookresearch/VideoPose3D
import torch
import torch.nn as nn



+2 -0  modelscope/models/cv/cartoon/facelib/LK/lk.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import numpy as np

from modelscope.models.cv.cartoon.facelib.config import config as cfg


+2 -0  modelscope/models/cv/cartoon/facelib/config.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import os

import numpy as np


+2 -0  modelscope/models/cv/cartoon/facelib/face_detector.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import time

import cv2


+2 -0  modelscope/models/cv/cartoon/facelib/face_landmark.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import cv2
import numpy as np
import tensorflow as tf


+2 -0  modelscope/models/cv/cartoon/facelib/facer.py

@@ -1,3 +1,5 @@
# The implementation is adopted from https://github.com/610265158/Peppa_Pig_Face_Engine

import time

import cv2


+2 -4  modelscope/models/cv/cartoon/mtcnn_pytorch/src/align_trans.py

@@ -1,7 +1,5 @@
"""
Created on Mon Apr 24 15:43:29 2017
@author: zhaoy
"""
# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch

import cv2
import numpy as np



+1 -5  modelscope/models/cv/cartoon/mtcnn_pytorch/src/matlab_cp2tform.py

@@ -1,8 +1,4 @@
"""
Created on Tue Jul 11 06:54:28 2017

@author: zhaoyafei
"""
# The implementation is adopted from https://github.com/TreB1eN/InsightFace_Pytorch/tree/master/mtcnn_pytorch

import numpy as np
from numpy.linalg import inv, lstsq


+2 -0  modelscope/models/cv/cartoon/utils.py

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os

import cv2


+1 -0  modelscope/models/cv/face_detection/mogface/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .models.detectors import MogFaceDetector

+1 -0  modelscope/models/cv/face_detection/mtcnn/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .models.detector import MtcnnFaceDetector

+1 -0  modelscope/models/cv/face_detection/retinaface/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .detection import RetinaFaceDetection

+1 -0  modelscope/models/cv/face_detection/ulfd_slim/__init__.py

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .detection import UlfdFaceDetector

+20 -0  modelscope/models/cv/face_emotion/__init__.py

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .emotion_model import EfficientNetForFaceEmotion

else:
_import_structure = {'emotion_model': ['EfficientNetForFaceEmotion']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+6 -0  modelscope/models/cv/face_emotion/efficient/__init__.py

@@ -0,0 +1,6 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

from .model import VALID_MODELS, EfficientNet
from .utils import (BlockArgs, BlockDecoder, GlobalParams, efficientnet,
get_model_params)

+380 -0  modelscope/models/cv/face_emotion/efficient/model.py

@@ -0,0 +1,380 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

import torch
from torch import nn
from torch.nn import functional as F

from .utils import (MemoryEfficientSwish, Swish, calculate_output_image_size,
drop_connect, efficientnet_params, get_model_params,
get_same_padding_conv2d, load_pretrained_weights,
round_filters, round_repeats)

VALID_MODELS = ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2',
'efficientnet-b3', 'efficientnet-b4', 'efficientnet-b5',
'efficientnet-b6', 'efficientnet-b7', 'efficientnet-b8',
'efficientnet-l2')


class MBConvBlock(nn.Module):

def __init__(self, block_args, global_params, image_size=None):
super().__init__()
self._block_args = block_args
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio
is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip

inp = self._block_args.input_filters
oup = self._block_args.input_filters * self._block_args.expand_ratio
if self._block_args.expand_ratio != 1:
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._expand_conv = Conv2d(
in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)

k = self._block_args.kernel_size
s = self._block_args.stride
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._depthwise_conv = Conv2d(
in_channels=oup,
out_channels=oup,
groups=oup,
kernel_size=k,
stride=s,
bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
image_size = calculate_output_image_size(image_size, s)

if self.has_se:
Conv2d = get_same_padding_conv2d(image_size=(1, 1))
num_squeezed_channels = max(
1,
int(self._block_args.input_filters
* self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)

final_oup = self._block_args.output_filters
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._project_conv = Conv2d(
in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(
num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
self._swish = MemoryEfficientSwish()

def forward(self, inputs, drop_connect_rate=None):
"""MBConvBlock's forward function.
Args:
inputs (tensor): Input tensor.
drop_connect_rate (float): Drop connect rate, between 0 and 1.
Returns:
Output of this block after processing.
"""

x = inputs
if self._block_args.expand_ratio != 1:
x = self._expand_conv(inputs)
x = self._bn0(x)
x = self._swish(x)

x = self._depthwise_conv(x)
x = self._bn1(x)
x = self._swish(x)

if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_reduce(x_squeezed)
x_squeezed = self._swish(x_squeezed)
x_squeezed = self._se_expand(x_squeezed)
x = torch.sigmoid(x_squeezed) * x

x = self._project_conv(x)
x = self._bn2(x)

input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(
x, p=drop_connect_rate, training=self.training)
x = x + inputs
return x

def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()


class EfficientNet(nn.Module):
"""EfficientNet model.
Most easily loaded with the .from_name or .from_pretrained methods.
Args:
blocks_args (list[namedtuple]): A list of BlockArgs to construct blocks.
global_params (namedtuple): A set of GlobalParams shared between blocks.
References:
[1] https://arxiv.org/abs/1905.11946 (EfficientNet)
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> model.eval()
>>> outputs = model(inputs)
"""

def __init__(self, blocks_args=None, global_params=None):
super().__init__()
assert isinstance(blocks_args, list), 'blocks_args should be a list'
assert len(blocks_args) > 0, 'block args must be greater than 0'
self._global_params = global_params
self._blocks_args = blocks_args

bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
image_size = global_params.image_size
Conv2d = get_same_padding_conv2d(image_size=image_size)

in_channels = 3
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)
image_size = calculate_output_image_size(image_size, 2)

self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:

block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters,
self._global_params),
output_filters=round_filters(block_args.output_filters,
self._global_params),
num_repeat=round_repeats(block_args.num_repeat,
self._global_params))

self._blocks.append(
MBConvBlock(
block_args, self._global_params, image_size=image_size))
image_size = calculate_output_image_size(image_size,
block_args.stride)
if block_args.num_repeat > 1:
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(
MBConvBlock(
block_args, self._global_params,
image_size=image_size))

in_channels = block_args.output_filters
out_channels = round_filters(1280, self._global_params)
Conv2d = get_same_padding_conv2d(image_size=image_size)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias=False)
self._bn1 = nn.BatchNorm2d(
num_features=out_channels, momentum=bn_mom, eps=bn_eps)

self._avg_pooling = nn.AdaptiveAvgPool2d(1)
if self._global_params.include_top:
self._dropout = nn.Dropout(self._global_params.dropout_rate)
self._fc = nn.Linear(out_channels, self._global_params.num_classes)

self._swish = MemoryEfficientSwish()

def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export).
Args:
memory_efficient (bool): Whether to use memory-efficient version of swish.
"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
for block in self._blocks:
block.set_swish(memory_efficient)

def extract_endpoints(self, inputs):
"""Use convolution layer to extract features
from reduction levels i in [1, 2, 3, 4, 5].
Args:
inputs (tensor): Input tensor.
Returns:
Dictionary of last intermediate features
with reduction levels i in [1, 2, 3, 4, 5].
Example:
>>> import torch
>>> from efficientnet.model import EfficientNet
>>> inputs = torch.rand(1, 3, 224, 224)
>>> model = EfficientNet.from_pretrained('efficientnet-b0')
>>> endpoints = model.extract_endpoints(inputs)
>>> print(endpoints['reduction_1'].shape) # torch.Size([1, 16, 112, 112])
>>> print(endpoints['reduction_2'].shape) # torch.Size([1, 24, 56, 56])
>>> print(endpoints['reduction_3'].shape) # torch.Size([1, 40, 28, 28])
>>> print(endpoints['reduction_4'].shape) # torch.Size([1, 112, 14, 14])
>>> print(endpoints['reduction_5'].shape) # torch.Size([1, 320, 7, 7])
>>> print(endpoints['reduction_6'].shape) # torch.Size([1, 1280, 7, 7])
"""
endpoints = dict()

x = self._swish(self._bn0(self._conv_stem(inputs)))
prev_x = x

for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(
self._blocks) # scale drop connect_rate
x = block(x, drop_connect_rate=drop_connect_rate)
if prev_x.size(2) > x.size(2):
endpoints['reduction_{}'.format(len(endpoints) + 1)] = prev_x
elif idx == len(self._blocks) - 1:
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x
prev_x = x

x = self._swish(self._bn1(self._conv_head(x)))
endpoints['reduction_{}'.format(len(endpoints) + 1)] = x

return endpoints

def extract_features(self, inputs):
"""use convolution layer to extract feature .
Args:
inputs (tensor): Input tensor.
Returns:
Output of the final convolution
layer in the efficientnet model.
"""
x = self._swish(self._bn0(self._conv_stem(inputs)))

for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
x = self._swish(self._bn1(self._conv_head(x)))

return x

def forward(self, inputs):
"""EfficientNet's forward function.
Calls extract_features to extract features, applies final linear layer, and returns logits.
Args:
inputs (tensor): Input tensor.
Returns:
Output of this model after processing.
"""
x = self.extract_features(inputs)
x = self._avg_pooling(x)
if self._global_params.include_top:
x = x.flatten(start_dim=1)
x = self._dropout(x)
x = self._fc(x)
return x

@classmethod
def from_name(cls, model_name, in_channels=3, **override_params):
"""Create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
in_channels (int): Input data's channel number.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
An efficientnet model.
"""
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(model_name,
override_params)
model = cls(blocks_args, global_params)
model._change_in_channels(in_channels)
return model

@classmethod
def from_pretrained(cls,
model_name,
weights_path=None,
advprop=False,
in_channels=3,
num_classes=1000,
**override_params):
"""Create an efficientnet model according to name.
Args:
model_name (str): Name for efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
advprop (bool):
Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
in_channels (int): Input data's channel number.
num_classes (int):
Number of categories for classification.
It controls the output size for final linear layer.
override_params (other key word params):
Params to override model's global_params.
Optional key:
'width_coefficient', 'depth_coefficient',
'image_size', 'dropout_rate',
'batch_norm_momentum',
'batch_norm_epsilon', 'drop_connect_rate',
'depth_divisor', 'min_depth'
Returns:
A pretrained efficientnet model.
"""
model = cls.from_name(
model_name, num_classes=num_classes, **override_params)
model._change_in_channels(in_channels)
return model

@classmethod
def get_image_size(cls, model_name):
"""Get the input image size for a given efficientnet model.
Args:
model_name (str): Name for efficientnet.
Returns:
Input image size (resolution).
"""
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res

@classmethod
def _check_model_name_is_valid(cls, model_name):
"""Validates model name.
Args:
model_name (str): Name for efficientnet.
Returns:
bool: Is a valid name or not.
"""
if model_name not in VALID_MODELS:
raise ValueError('model_name should be one of: '
+ ', '.join(VALID_MODELS))

def _change_in_channels(self, in_channels):
"""Adjust model's first convolution layer to in_channels, if in_channels not equals 3.
Args:
in_channels (int): Input data's channel number.
"""
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=self._global_params.image_size)
out_channels = round_filters(32, self._global_params)
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias=False)
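
A quick smoke test of the EfficientNet port added above, as a minimal sketch (the input resolution and the 7-class head are illustrative assumptions; the accompanying utils.py maps 'efficientnet-b0' to a 112-pixel resolution):

import torch
from modelscope.models.cv.face_emotion.efficient import EfficientNet

model = EfficientNet.from_name('efficientnet-b0', num_classes=7)  # 7 classes is illustrative
model.eval()
with torch.no_grad():
    feats = model.extract_features(torch.rand(1, 3, 112, 112))  # final conv feature map
    logits = model(torch.rand(1, 3, 112, 112))                  # pooled + dropout + fc head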

+559 -0  modelscope/models/cv/face_emotion/efficient/utils.py

@@ -0,0 +1,559 @@
# The implementation here is modified based on EfficientNet,
# originally Apache 2.0 License and publicly available at https://github.com/lukemelas/EfficientNet-PyTorch

import collections
import math
import re
from functools import partial

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import model_zoo

GlobalParams = collections.namedtuple('GlobalParams', [
'width_coefficient', 'depth_coefficient', 'image_size', 'dropout_rate',
'num_classes', 'batch_norm_momentum', 'batch_norm_epsilon',
'drop_connect_rate', 'depth_divisor', 'min_depth', 'include_top'
])

BlockArgs = collections.namedtuple('BlockArgs', [
'num_repeat', 'kernel_size', 'stride', 'expand_ratio', 'input_filters',
'output_filters', 'se_ratio', 'id_skip'
])

GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields)

if hasattr(nn, 'SiLU'):
Swish = nn.SiLU
else:

class Swish(nn.Module):

def forward(self, x):
return x * torch.sigmoid(x)


class SwishImplementation(torch.autograd.Function):

@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result

@staticmethod
def backward(ctx, grad_output):
i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class MemoryEfficientSwish(nn.Module):

def forward(self, x):
return SwishImplementation.apply(x)


def round_filters(filters, global_params):
"""Calculate and round number of filters based on width multiplier.
Use width_coefficient, depth_divisor and min_depth of global_params.
Args:
filters (int): Filters number to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new_filters: New filters number after calculating.
"""
multiplier = global_params.width_coefficient
if not multiplier:
return filters

divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth,
int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)


def round_repeats(repeats, global_params):
"""Calculate module's repeat number of a block based on depth multiplier.
Use depth_coefficient of global_params.
Args:
repeats (int): num_repeat to be calculated.
global_params (namedtuple): Global params of the model.
Returns:
new repeat: New repeat number after calculating.
"""
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))


def drop_connect(inputs, p, training):
"""Drop connect.
Args:
input (tensor: BCWH): Input of this structure.
p (float: 0.0~1.0): Probability of drop connection.
training (bool): The running mode.
Returns:
output: Output after drop connection.
"""
assert 0 <= p <= 1, 'p must be in range of [0,1]'

if not training:
return inputs

batch_size = inputs.shape[0]
keep_prob = 1 - p

random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1],
dtype=inputs.dtype,
device=inputs.device)
binary_tensor = torch.floor(random_tensor)

output = inputs / keep_prob * binary_tensor
return output


def get_width_and_height_from_size(x):
"""Obtain height and width from x.
Args:
x (int, tuple or list): Data size.
Returns:
size: A tuple or list (H,W).
"""
if isinstance(x, int):
return x, x
if isinstance(x, list) or isinstance(x, tuple):
return x
else:
raise TypeError()


def calculate_output_image_size(input_image_size, stride):
"""Calculates the output image size when using Conv2dSamePadding with a stride.
Necessary for static padding. Thanks to mannatsingh for pointing this out.
Args:
input_image_size (int, tuple or list): Size of input image.
stride (int, tuple or list): Conv2d operation's stride.
Returns:
output_image_size: A list [H,W].
"""
if input_image_size is None:
return None
image_height, image_width = get_width_and_height_from_size(
input_image_size)
stride = stride if isinstance(stride, int) else stride[0]
image_height = int(math.ceil(image_height / stride))
image_width = int(math.ceil(image_width / stride))
return [image_height, image_width]


def get_same_padding_conv2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
Conv2dDynamicSamePadding or Conv2dStaticSamePadding.
"""
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)


class Conv2dDynamicSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow, for a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dilation=1,
groups=1,
bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0,
dilation, groups, bias)
self.stride = self.stride if len(
self.stride) == 2 else [self.stride[0]] * 2

def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
a1 = (oh - 1) * self.stride[0]
pad_h = max(a1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
a2 = (ow - 1) * self.stride[1]
pad_w = max(a2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)


class Conv2dStaticSamePadding(nn.Conv2d):
"""2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size.
The padding module is computed in the constructor and then applied in forward.
"""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
image_size=None,
**kwargs):
super().__init__(in_channels, out_channels, kernel_size, stride,
**kwargs)
self.stride = self.stride if len(
self.stride) == 2 else [self.stride[0]] * 2

assert image_size is not None
ih, iw = (image_size,
image_size) if isinstance(image_size, int) else image_size
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
b1 = (oh - 1) * self.stride[0]
pad_h = max(b1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
b2 = (ow - 1) * self.stride[1]
pad_w = max(b2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2))
else:
self.static_padding = nn.Identity()

def forward(self, x):
x = self.static_padding(x)
x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
self.dilation, self.groups)
return x


def get_same_padding_maxPool2d(image_size=None):
"""Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models.
Args:
image_size (int or tuple): Size of the image.
Returns:
MaxPool2dDynamicSamePadding or MaxPool2dStaticSamePadding.
"""
if image_size is None:
return MaxPool2dDynamicSamePadding
else:
return partial(MaxPool2dStaticSamePadding, image_size=image_size)


class MaxPool2dDynamicSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with a dynamic image size.
The padding is operated in forward function by calculating dynamically.
"""

def __init__(self,
kernel_size,
stride,
padding=0,
dilation=1,
return_indices=False,
ceil_mode=False):
super().__init__(kernel_size, stride, padding, dilation,
return_indices, ceil_mode)
self.stride = [self.stride] * 2 if isinstance(self.stride,
int) else self.stride
self.kernel_size = [self.kernel_size] * 2 if isinstance(
self.kernel_size, int) else self.kernel_size
self.dilation = [self.dilation] * 2 if isinstance(
self.dilation, int) else self.dilation

def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
c1 = (oh - 1) * self.stride[0]
pad_h = max(c1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
c2 = (ow - 1) * self.stride[1]
pad_w = max(c2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
])
return F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)


class MaxPool2dStaticSamePadding(nn.MaxPool2d):
"""2D MaxPooling like TensorFlow's 'SAME' mode, with the given input image size.
The padding module is computed in the constructor and then applied in forward.
"""

def __init__(self, kernel_size, stride, image_size=None, **kwargs):
super().__init__(kernel_size, stride, **kwargs)
self.stride = [self.stride] * 2 if isinstance(self.stride,
int) else self.stride
self.kernel_size = [self.kernel_size] * 2 if isinstance(
self.kernel_size, int) else self.kernel_size
self.dilation = [self.dilation] * 2 if isinstance(
self.dilation, int) else self.dilation

assert image_size is not None
ih, iw = (image_size,
image_size) if isinstance(image_size, int) else image_size
kh, kw = self.kernel_size
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
d1 = (oh - 1) * self.stride[0]
pad_h = max(d1 + (kh - 1) * self.dilation[0] + 1 - ih, 0)
d2 = (ow - 1) * self.stride[1]
pad_w = max(d2 + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.ZeroPad2d(
(pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2))
else:
self.static_padding = nn.Identity()

def forward(self, x):
x = self.static_padding(x)
x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding,
self.dilation, self.ceil_mode, self.return_indices)
return x


class BlockDecoder(object):
"""Block Decoder for readability,
straight from the official TensorFlow repository.
"""

@staticmethod
def _decode_block_string(block_string):
"""Get a block through a string notation of arguments.
Args:
block_string (str): A string notation of arguments.
Examples: 'r1_k3_s11_e1_i32_o16_se0.25_noskip'.
Returns:
BlockArgs: The namedtuple defined at the top of this file.
"""
assert isinstance(block_string, str)

ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value

# Check stride
assert (('s' in options and len(options['s']) == 1)
or (len(options['s']) == 2
and options['s'][0] == options['s'][1]))

return BlockArgs(
num_repeat=int(options['r']),
kernel_size=int(options['k']),
stride=[int(options['s'][0])],
expand_ratio=int(options['e']),
input_filters=int(options['i']),
output_filters=int(options['o']),
se_ratio=float(options['se']) if 'se' in options else None,
id_skip=('noskip' not in block_string))

@staticmethod
def _encode_block_string(block):
"""Encode a block to a string.
Args:
block (namedtuple): A BlockArgs type argument.
Returns:
block_string: A String form of BlockArgs.
"""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)

@staticmethod
def decode(string_list):
"""Decode a list of string notations to specify blocks inside the network.
Args:
string_list (list[str]): A list of strings, each string is a notation of block.
Returns:
blocks_args: A list of BlockArgs namedtuples of block args.
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args

@staticmethod
def encode(blocks_args):
"""Encode a list of BlockArgs to a list of strings.
Args:
blocks_args (list[namedtuples]): A list of BlockArgs namedtuples of block args.
Returns:
block_strings: A list of strings, each string is a notation of block.
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings


def efficientnet_params(model_name):
"""Map EfficientNet model name to parameter coefficients.
Args:
model_name (str): Model name to be queried.
Returns:
params_dict[model_name]: A (width,depth,res,dropout) tuple.
"""
params_dict = {
'efficientnet-b0': (1.0, 1.0, 112, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
}
return params_dict[model_name]


def efficientnet(width_coefficient=None,
depth_coefficient=None,
image_size=None,
dropout_rate=0.2,
drop_connect_rate=0.2,
num_classes=1000,
include_top=True):
"""Create BlockArgs and GlobalParams for efficientnet model.
Args:
width_coefficient (float)
depth_coefficient (float)
image_size (int)
dropout_rate (float)
drop_connect_rate (float)
num_classes (int)
Meaning as the name suggests.
Returns:
blocks_args, global_params.
"""

blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25',
'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25',
'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25',
'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)

global_params = GlobalParams(
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
image_size=image_size,
dropout_rate=dropout_rate,
num_classes=num_classes,
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
drop_connect_rate=drop_connect_rate,
depth_divisor=8,
min_depth=None,
include_top=include_top,
)
return blocks_args, global_params


def get_model_params(model_name, override_params):
"""Get the block args and global params for a given model name.
Args:
model_name (str): Model's name.
override_params (dict): A dict to modify global_params.
Returns:
blocks_args, global_params
"""
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
blocks_args, global_params = efficientnet(
width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
else:
raise NotImplementedError(
'model name is not pre-defined: {}'.format(model_name))
if override_params:
global_params = global_params._replace(**override_params)
return blocks_args, global_params


def load_pretrained_weights(model,
model_name,
weights_path=None,
load_fc=True,
advprop=False,
verbose=True):
"""Loads pretrained weights from weights path or download using url.
Args:
model (Module): The whole model of efficientnet.
model_name (str): Model name of efficientnet.
weights_path (None or str):
str: path to pretrained weights file on the local disk.
None: use pretrained weights downloaded from the Internet.
load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model.
advprop (bool): Whether to load pretrained weights
trained with advprop (valid when weights_path is None).
"""
if isinstance(weights_path, str):
state_dict = torch.load(weights_path)
else:
url_map_ = url_map_advprop if advprop else url_map
state_dict = model_zoo.load_url(url_map_[model_name])

if load_fc:
ret = model.load_state_dict(state_dict, strict=False)
assert not ret.missing_keys, 'Missing keys when loading pretrained weights: {}'.format(
ret.missing_keys)
else:
state_dict.pop('_fc.weight')
state_dict.pop('_fc.bias')
ret = model.load_state_dict(state_dict, strict=False)
assert set(ret.missing_keys) == set([
'_fc.weight', '_fc.bias'
]), 'Missing keys when loading pretrained weights: {}'.format(
ret.missing_keys)
assert not ret.unexpected_keys, 'Missing keys when loading pretrained weights: {}'.format(
ret.unexpected_keys)

if verbose:
print('Loaded pretrained weights for {}'.format(model_name))
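
For reference, the block-string notation consumed by BlockDecoder above decodes as follows; this is a small illustration added for clarity, not part of the commit:

from modelscope.models.cv.face_emotion.efficient.utils import BlockDecoder

blocks = BlockDecoder.decode(['r1_k3_s11_e1_i32_o16_se0.25'])
# blocks[0] == BlockArgs(num_repeat=1, kernel_size=3, stride=[1], expand_ratio=1,
#                        input_filters=32, output_filters=16, se_ratio=0.25, id_skip=True)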

+67 -0  modelscope/models/cv/face_emotion/emotion_infer.py

@@ -0,0 +1,67 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import torch
from PIL import Image
from torch import nn
from torchvision import transforms

from modelscope.utils.logger import get_logger
from .face_alignment.face_align import face_detection_PIL_v2

logger = get_logger()


def transform_PIL(img_pil):
val_transforms = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
return val_transforms(img_pil)


index2AU = [1, 2, 4, 6, 7, 10, 12, 15, 23, 24, 25, 26]
emotion_list = [
'Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise'
]


def inference(image_path, model, face_model, score_thre=0.5, GPU=0):
image = Image.open(image_path).convert('RGB')

face, bbox = face_detection_PIL_v2(image, face_model)
if bbox is None:
logger.warn('no face detected!')
result = {'emotion_result': None, 'box': None}
return result

face = transform_PIL(face)
face = face.unsqueeze(0)
if torch.cuda.is_available():
face = face.cuda(GPU)
logits_AU, logits_emotion = model(face)
logits_AU = torch.sigmoid(logits_AU)
logits_emotion = nn.functional.softmax(logits_emotion, 1)

_, index_list = logits_emotion.max(1)
emotion_index = index_list[0].data.item()
prob = logits_emotion[0][emotion_index]
if prob > score_thre and emotion_index != 3:
cur_emotion = emotion_list[emotion_index]
else:
cur_emotion = 'Neutral'

logits_AU = logits_AU[0]
au_ouput = torch.zeros_like(logits_AU)
au_ouput[logits_AU >= score_thre] = 1
au_ouput[logits_AU < score_thre] = 0

au_ouput = au_ouput.int()

cur_au_list = []
for idx in range(au_ouput.shape[0]):
if au_ouput[idx] == 1:
au = index2AU[idx]
cur_au_list.append(au)
cur_au_list.sort()
result = (cur_emotion, bbox)
return result
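
A hedged usage sketch of the inference helper above; the model directory and the face-detector checkpoint path are placeholders (the checkpoint is the TensorFlow frozen graph expected by FaceDetector in face_alignment/face.py):

from modelscope.models.cv.face_emotion.emotion_model import EfficientNetForFaceEmotion
from modelscope.models.cv.face_emotion.emotion_infer import inference

model = EfficientNetForFaceEmotion('/path/to/face_emotion_model_dir')   # placeholder path
emotion, bbox = inference('data/test/images/face_emotion.jpg', model,
                          '/path/to/face_detector_frozen_graph.pb')     # placeholder path
# when a face is found, returns the predicted emotion label and the detected face box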

+96 -0  modelscope/models/cv/face_emotion/emotion_model.py

@@ -0,0 +1,96 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os
import sys

import torch
import torch.nn.functional as F
from torch import nn

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.face_emotion.efficient import EfficientNet
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@MODELS.register_module(Tasks.face_emotion, module_name=Models.face_emotion)
class EfficientNetForFaceEmotion(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)
self.model = FaceEmotionModel(
name='efficientnet-b0', num_embed=512, num_au=12, num_emotion=7)

if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU')
else:
self.device = 'cpu'
logger.info('Use CPU')
pretrained_params = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=self.device)

state_dict = pretrained_params['model']
new_state = {}
for k, v in state_dict.items():
if k.startswith('module.'):
k = k[7:]
new_state[k] = v

self.model.load_state_dict(new_state)
self.model.eval()
self.model.to(self.device)

def forward(self, x):
logits_au, logits_emotion = self.model(x)
return logits_au, logits_emotion


class FaceEmotionModel(nn.Module):

def __init__(self,
name='efficientnet-b0',
num_embed=512,
num_au=12,
num_emotion=7):
super(FaceEmotionModel, self).__init__()
self.backbone = EfficientNet.from_pretrained(
name, weights_path=None, advprop=True)
self.average_pool = nn.AdaptiveAvgPool2d(1)
self.embed = nn.Linear(self.backbone._fc.weight.data.shape[1],
num_embed)
self.features = nn.BatchNorm1d(num_embed)
nn.init.constant_(self.features.weight, 1.0)
self.features.weight.requires_grad = False
self.fc_au = nn.Sequential(
nn.Dropout(0.6),
nn.Linear(num_embed, num_au),
)
self.fc_emotion = nn.Sequential(
nn.Dropout(0.6),
nn.Linear(num_embed, num_emotion),
)

def feat_single_img(self, x):
x = self.backbone.extract_features(x)
x = self.average_pool(x)
x = x.flatten(1)
x = self.embed(x)
x = self.features(x)
return x

def forward(self, x):
x = self.feat_single_img(x)
logits_au = self.fc_au(x)
att_au = torch.sigmoid(logits_au).unsqueeze(-1)
x = x.unsqueeze(1)
emotion_vec_list = torch.matmul(att_au, x)
emotion_vec = emotion_vec_list.sum(1)
logits_emotion = self.fc_emotion(emotion_vec)
return logits_au, logits_emotion

+0 -0  modelscope/models/cv/face_emotion/face_alignment/__init__.py


+79 -0  modelscope/models/cv/face_emotion/face_alignment/face.py

@@ -0,0 +1,79 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os

import cv2
import numpy as np
import tensorflow as tf


def init(mod):
PATH_TO_CKPT = mod
net = tf.Graph()
with net.as_default():
od_graph_def = tf.GraphDef()
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.6
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
sess = tf.Session(graph=net, config=config)
return sess, net


def filter_bboxes_confs(shape,
imgsBboxes,
imgsConfs,
single=False,
thresh=0.5):
[w, h] = shape
if single:
bboxes, confs = [], []
for y in range(len(imgsBboxes)):
if imgsConfs[y] >= thresh:
[x1, y1, x2, y2] = list(imgsBboxes[y])
x1, y1, x2, y2 = int(w * x1), int(h * y1), int(w * x2), int(
h * y2)
bboxes.append([y1, x1, y2, x2])
confs.append(imgsConfs[y])
return bboxes, confs
else:
retImgsBboxes, retImgsConfs = [], []
for x in range(len(imgsBboxes)):
bboxes, confs = [], []
for y in range(len(imgsBboxes[x])):
if imgsConfs[x][y] >= thresh:
[x1, y1, x2, y2] = list(imgsBboxes[x][y])
x1, y1, x2, y2 = int(w * x1), int(h * y1), int(
w * x2), int(h * y2)
bboxes.append([y1, x1, y2, x2])
confs.append(imgsConfs[x][y])
retImgsBboxes.append(bboxes)
retImgsConfs.append(confs)
return retImgsBboxes, retImgsConfs


def detect(im, sess, net):
image_np = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
image_np_expanded = np.expand_dims(image_np, axis=0)
image_tensor = net.get_tensor_by_name('image_tensor:0')
bboxes = net.get_tensor_by_name('detection_boxes:0')
dConfs = net.get_tensor_by_name('detection_scores:0')
classes = net.get_tensor_by_name('detection_classes:0')
num_detections = net.get_tensor_by_name('num_detections:0')
(bboxes, dConfs, classes,
num_detections) = sess.run([bboxes, dConfs, classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
w, h, _ = im.shape
bboxes, confs = filter_bboxes_confs([w, h], bboxes[0], dConfs[0], True)
return bboxes, confs


class FaceDetector:

def __init__(self, mod):
self.sess, self.net = init(mod)

def do_detect(self, im):
bboxes, confs = detect(im, self.sess, self.net)
return bboxes, confs

+59 -0  modelscope/models/cv/face_emotion/face_alignment/face_align.py

@@ -0,0 +1,59 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import os
import sys

import cv2
import numpy as np
from PIL import Image, ImageFile

from .face import FaceDetector

ImageFile.LOAD_TRUNCATED_IMAGES = True


def adjust_bx_v2(box, w, h):
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
box_w = x2 - x1
box_h = y2 - y1
delta = abs(box_w - box_h)
if box_w > box_h:
if y1 >= delta:
y1 = y1 - delta
else:
delta_y1 = y1
y1 = 0
delta_y2 = delta - delta_y1
y2 = y2 + delta_y2 if y2 < h - delta_y2 else h - 1
else:
if x1 >= delta / 2 and x2 <= w - delta / 2:
x1 = x1 - delta / 2
x2 = x2 + delta / 2
elif x1 < delta / 2 and x2 <= w - delta / 2:
delta_x1 = x1
x1 = 0
delta_x2 = delta - delta_x1
x2 = x2 + delta_x2 if x2 < w - delta_x2 else w - 1
elif x1 >= delta / 2 and x2 > w - delta / 2:
delta_x2 = w - x2
x2 = w - 1
delta_x1 = delta - x1
x1 = x1 - delta_x1 if x1 >= delta_x1 else 0

x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
return [x1, y1, x2, y2]


def face_detection_PIL_v2(image, face_model):
crop_size = 112
face_detector = FaceDetector(face_model)
img = np.array(image)
h, w = img.shape[0:2]
bxs, conf = face_detector.do_detect(img)
bx = bxs[0]
bx = adjust_bx_v2(bx, w, h)
x1, y1, x2, y2 = bx
image = img[y1:y2, x1:x2, :]
img = Image.fromarray(image)
img = img.resize((crop_size, crop_size))
bx = tuple(bx)
return img, bx

+2 -0  modelscope/models/cv/face_generation/op/conv2d_gradfix.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py
import contextlib
import warnings



+2 -0  modelscope/models/cv/face_generation/op/fused_act.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py
import os

import torch


+2 -0  modelscope/models/cv/face_generation/op/upfirdn2d.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License
# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py
import os
from collections import abc



+2 -0  modelscope/models/cv/face_generation/stylegan2.py

@@ -1,3 +1,5 @@
# The implementation is adopted from stylegan2-pytorch,
# made public available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
import functools
import math
import operator


+20 -0  modelscope/models/cv/face_human_hand_detection/__init__.py

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .det_infer import NanoDetForFaceHumanHandDetection

else:
_import_structure = {'det_infer': ['NanoDetForFaceHumanHandDetection']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+133 -0  modelscope/models/cv/face_human_hand_detection/det_infer.py

@@ -0,0 +1,133 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.

import cv2
import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .one_stage_detector import OneStageDetector

logger = get_logger()


def load_model_weight(model_dir, device):
checkpoint = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=device)
state_dict = checkpoint['state_dict'].copy()
for k in checkpoint['state_dict']:
if k.startswith('avg_model.'):
v = state_dict.pop(k)
state_dict[k[4:]] = v

return state_dict


@MODELS.register_module(
Tasks.face_human_hand_detection,
module_name=Models.face_human_hand_detection)
class NanoDetForFaceHumanHandDetection(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = OneStageDetector()
if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU ')
else:
self.device = 'cpu'
logger.info('Use CPU')

self.state_dict = load_model_weight(model_dir, self.device)
self.model.load_state_dict(self.state_dict, strict=False)
self.model.eval()
self.model.to(self.device)

def forward(self, x):
pred_result = self.model.inference(x)
return pred_result


def naive_collate(batch):
elem = batch[0]
if isinstance(elem, dict):
return {key: naive_collate([d[key] for d in batch]) for key in elem}
else:
return batch


def get_resize_matrix(raw_shape, dst_shape):

r_w, r_h = raw_shape
d_w, d_h = dst_shape
Rs = np.eye(3)

Rs[0, 0] *= d_w / r_w
Rs[1, 1] *= d_h / r_h
return Rs


def color_aug_and_norm(meta, mean, std):
img = meta['img'].astype(np.float32) / 255
mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3) / 255
std = np.array(std, dtype=np.float32).reshape(1, 1, 3) / 255
img = (img - mean) / std
meta['img'] = img
return meta


def img_process(meta, mean, std):
raw_img = meta['img']
height = raw_img.shape[0]
width = raw_img.shape[1]
dst_shape = [320, 320]
M = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
ResizeM = get_resize_matrix((width, height), dst_shape)
M = ResizeM @ M
img = cv2.warpPerspective(raw_img, M, dsize=tuple(dst_shape))
meta['img'] = img
meta['warp_matrix'] = M
meta = color_aug_and_norm(meta, mean, std)
return meta


def overlay_bbox_cv(dets, class_names, score_thresh):
all_box = []
for label in dets:
for bbox in dets[label]:
score = bbox[-1]
if score > score_thresh:
x0, y0, x1, y1 = [int(i) for i in bbox[:4]]
all_box.append([label, x0, y0, x1, y1, score])
all_box.sort(key=lambda v: v[5])
return all_box


mean = [103.53, 116.28, 123.675]
std = [57.375, 57.12, 58.395]
class_names = ['person', 'face', 'hand']


def inference(model, device, img_path):
img_info = {'id': 0}
img = cv2.imread(img_path)
height, width = img.shape[:2]
img_info['height'] = height
img_info['width'] = width
meta = dict(img_info=img_info, raw_img=img, img=img)

meta = img_process(meta, mean, std)
meta['img'] = torch.from_numpy(meta['img'].transpose(2, 0, 1)).to(device)
meta = naive_collate([meta])
meta['img'] = (meta['img'][0]).reshape(1, 3, 320, 320)
with torch.no_grad():
res = model(meta)
result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35)
return result
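
For context, a minimal usage sketch of the inference helpers above (not part of the diff); the model directory and image path are hypothetical placeholders for any directory holding the expected pytorch_model.bin checkpoint and any test image.

from modelscope.models.cv.face_human_hand_detection.det_infer import (
    NanoDetForFaceHumanHandDetection, class_names, inference)

# 'path/to/model_dir' and 'test.jpg' are placeholders, not files added by this PR.
detector = NanoDetForFaceHumanHandDetection(model_dir='path/to/model_dir')
boxes = inference(detector, detector.device, 'test.jpg')
# Each entry is [label, x0, y0, x1, y1, score], sorted by ascending score.
for label, x0, y0, x1, y1, score in boxes:
    print(class_names[label], (x0, y0, x1, y1), round(float(score), 3))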

+ 395
- 0
modelscope/models/cv/face_human_hand_detection/ghost_pan.py View File

@@ -0,0 +1,395 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .utils import ConvModule, DepthwiseConvModule, act_layers


def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v


def hard_sigmoid(x, inplace: bool = False):
if inplace:
return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0)
else:
return F.relu6(x + 3.0) / 6.0


class SqueezeExcite(nn.Module):

def __init__(self,
in_chs,
se_ratio=0.25,
reduced_base_chs=None,
activation='ReLU',
gate_fn=hard_sigmoid,
divisor=4,
**_):
super(SqueezeExcite, self).__init__()
self.gate_fn = gate_fn
reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio,
divisor)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
self.act1 = act_layers(activation)
self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)

def forward(self, x):
x_se = self.avg_pool(x)
x_se = self.conv_reduce(x_se)
x_se = self.act1(x_se)
x_se = self.conv_expand(x_se)
x = x * self.gate_fn(x_se)
return x


class GhostModule(nn.Module):

def __init__(self,
inp,
oup,
kernel_size=1,
ratio=2,
dw_size=3,
stride=1,
activation='ReLU'):
super(GhostModule, self).__init__()
self.oup = oup
init_channels = math.ceil(oup / ratio)
new_channels = init_channels * (ratio - 1)

self.primary_conv = nn.Sequential(
nn.Conv2d(
inp,
init_channels,
kernel_size,
stride,
kernel_size // 2,
bias=False),
nn.BatchNorm2d(init_channels),
act_layers(activation) if activation else nn.Sequential(),
)

self.cheap_operation = nn.Sequential(
nn.Conv2d(
init_channels,
new_channels,
dw_size,
1,
dw_size // 2,
groups=init_channels,
bias=False,
),
nn.BatchNorm2d(new_channels),
act_layers(activation) if activation else nn.Sequential(),
)

def forward(self, x):
x1 = self.primary_conv(x)
x2 = self.cheap_operation(x1)
out = torch.cat([x1, x2], dim=1)
return out


class GhostBottleneck(nn.Module):
"""Ghost bottleneck w/ optional SE"""

def __init__(
self,
in_chs,
mid_chs,
out_chs,
dw_kernel_size=3,
stride=1,
activation='ReLU',
se_ratio=0.0,
):
super(GhostBottleneck, self).__init__()
has_se = se_ratio is not None and se_ratio > 0.0
self.stride = stride

# Point-wise expansion
self.ghost1 = GhostModule(in_chs, mid_chs, activation=activation)

# Depth-wise convolution
if self.stride > 1:
self.conv_dw = nn.Conv2d(
mid_chs,
mid_chs,
dw_kernel_size,
stride=stride,
padding=(dw_kernel_size - 1) // 2,
groups=mid_chs,
bias=False,
)
self.bn_dw = nn.BatchNorm2d(mid_chs)

if has_se:
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio)
else:
self.se = None

self.ghost2 = GhostModule(mid_chs, out_chs, activation=None)

if in_chs == out_chs and self.stride == 1:
self.shortcut = nn.Sequential()
else:
self.shortcut = nn.Sequential(
nn.Conv2d(
in_chs,
in_chs,
dw_kernel_size,
stride=stride,
padding=(dw_kernel_size - 1) // 2,
groups=in_chs,
bias=False,
),
nn.BatchNorm2d(in_chs),
nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(out_chs),
)

def forward(self, x):
residual = x

x = self.ghost1(x)

if self.stride > 1:
x = self.conv_dw(x)
x = self.bn_dw(x)

if self.se is not None:
x = self.se(x)

x = self.ghost2(x)

x += self.shortcut(residual)
return x


class GhostBlocks(nn.Module):
"""Stack of GhostBottleneck used in GhostPAN.

Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
expand (int): Expand ratio of GhostBottleneck. Default: 1.
kernel_size (int): Kernel size of depthwise convolution. Default: 5.
num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
use_res (bool): Whether to use residual connection. Default: False.
activation (str): Name of activation function. Default: LeakyReLU.
"""

def __init__(
self,
in_channels,
out_channels,
expand=1,
kernel_size=5,
num_blocks=1,
use_res=False,
activation='LeakyReLU',
):
super(GhostBlocks, self).__init__()
self.use_res = use_res
if use_res:
self.reduce_conv = ConvModule(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
activation=activation,
)
blocks = []
for _ in range(num_blocks):
blocks.append(
GhostBottleneck(
in_channels,
int(out_channels * expand),
out_channels,
dw_kernel_size=kernel_size,
activation=activation,
))
self.blocks = nn.Sequential(*blocks)

def forward(self, x):
out = self.blocks(x)
if self.use_res:
out = out + self.reduce_conv(x)
return out


class GhostPAN(nn.Module):
"""Path Aggregation Network with Ghost block.

Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3
use_depthwise (bool): Whether to use depthwise separable convolution in
blocks. Default: False
kernel_size (int): Kernel size of depthwise convolution. Default: 5.
expand (int): Expand ratio of GhostBottleneck. Default: 1.
num_blocks (int): Number of GhostBottleneck blocks. Default: 1.
use_res (bool): Whether to use residual connection. Default: False.
num_extra_level (int): Number of extra conv layers for more feature levels.
Default: 0.
upsample_cfg (dict): Config dict for interpolate layer.
Default: `dict(scale_factor=2, mode='bilinear')`
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='BN')
activation (str): Activation layer name.
Default: LeakyReLU.
"""

def __init__(
self,
in_channels,
out_channels,
use_depthwise=False,
kernel_size=5,
expand=1,
num_blocks=1,
use_res=False,
num_extra_level=0,
upsample_cfg=dict(scale_factor=2, mode='bilinear'),
norm_cfg=dict(type='BN'),
activation='LeakyReLU',
):
super(GhostPAN, self).__init__()
assert num_extra_level >= 0
assert num_blocks >= 1
self.in_channels = in_channels
self.out_channels = out_channels

conv = DepthwiseConvModule if use_depthwise else ConvModule

# build top-down blocks
self.upsample = nn.Upsample(**upsample_cfg)
self.reduce_layers = nn.ModuleList()
for idx in range(len(in_channels)):
self.reduce_layers.append(
ConvModule(
in_channels[idx],
out_channels,
1,
norm_cfg=norm_cfg,
activation=activation,
))
self.top_down_blocks = nn.ModuleList()
for idx in range(len(in_channels) - 1, 0, -1):
self.top_down_blocks.append(
GhostBlocks(
out_channels * 2,
out_channels,
expand,
kernel_size=kernel_size,
num_blocks=num_blocks,
use_res=use_res,
activation=activation,
))

# build bottom-up blocks
self.downsamples = nn.ModuleList()
self.bottom_up_blocks = nn.ModuleList()
for idx in range(len(in_channels) - 1):
self.downsamples.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))
self.bottom_up_blocks.append(
GhostBlocks(
out_channels * 2,
out_channels,
expand,
kernel_size=kernel_size,
num_blocks=num_blocks,
use_res=use_res,
activation=activation,
))

# extra layers
self.extra_lvl_in_conv = nn.ModuleList()
self.extra_lvl_out_conv = nn.ModuleList()
for i in range(num_extra_level):
self.extra_lvl_in_conv.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))
self.extra_lvl_out_conv.append(
conv(
out_channels,
out_channels,
kernel_size,
stride=2,
padding=kernel_size // 2,
norm_cfg=norm_cfg,
activation=activation,
))

def forward(self, inputs):
"""
Args:
inputs (tuple[Tensor]): input features.
Returns:
tuple[Tensor]: multi level features.
"""
assert len(inputs) == len(self.in_channels)
inputs = [
reduce(input_x)
for input_x, reduce in zip(inputs, self.reduce_layers)
]
# top-down path
inner_outs = [inputs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
feat_heigh = inner_outs[0]
feat_low = inputs[idx - 1]

inner_outs[0] = feat_heigh

upsample_feat = self.upsample(feat_heigh)

inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
torch.cat([upsample_feat, feat_low], 1))
inner_outs.insert(0, inner_out)

# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
feat_height = inner_outs[idx + 1]
downsample_feat = self.downsamples[idx](feat_low)
out = self.bottom_up_blocks[idx](
torch.cat([downsample_feat, feat_height], 1))
outs.append(out)

# extra layers
for extra_in_layer, extra_out_layer in zip(self.extra_lvl_in_conv,
self.extra_lvl_out_conv):
outs.append(extra_in_layer(inputs[-1]) + extra_out_layer(outs[-1]))

return tuple(outs)
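
A quick shape walk-through of the neck above (a sketch, not part of the change); the channel list and the 320x320-derived feature sizes simply mirror the configuration used by OneStageDetector later in this PR.

import torch
from modelscope.models.cv.face_human_hand_detection.ghost_pan import GhostPAN

neck = GhostPAN(
    in_channels=[116, 232, 464],
    out_channels=96,
    use_depthwise=True,
    num_extra_level=1)
feats = (torch.randn(1, 116, 40, 40), torch.randn(1, 232, 20, 20),
         torch.randn(1, 464, 10, 10))
outs = neck(feats)
# Four 96-channel levels: strides 8/16/32 from the PAN plus one extra stride-64 level.
print([tuple(o.shape) for o in outs])
# [(1, 96, 40, 40), (1, 96, 20, 20), (1, 96, 10, 10), (1, 96, 5, 5)]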

+ 427
- 0
modelscope/models/cv/face_human_hand_detection/nanodet_plus_head.py View File

@@ -0,0 +1,427 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import math

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import nms

from .utils import ConvModule, DepthwiseConvModule


class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
This layer calculates the target location by :math:`sum{P(y_i) * y_i}`, where
P(y_i) denotes the softmax vector that represents the discrete distribution and
y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}.
Args:
reg_max (int): The maximal value of the discrete set. Default: 16. You
may want to reset it according to your new dataset or related
settings.
"""

def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))

def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
Args:
x (Tensor): Features of the regression head, shape (N, 4*(n+1)),
n is self.reg_max.
Returns:
x (Tensor): Integral result of box locations, i.e., distance
offsets from the box center in four directions, shape (N, 4).
"""
shape = x.size()
x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1)
x = F.linear(x, self.project.type_as(x)).reshape(*shape[:-1], 4)
return x


def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
"""Performs non-maximum suppression in a batched fashion.
Modified from https://github.com/pytorch/vision/blob
/505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
In order to perform NMS independently per class, we add an offset to all
the boxes. The offset is dependent only on the class idx, and is large
enough so that boxes from different classes do not overlap.
Arguments:
boxes (torch.Tensor): boxes in shape (N, 4).
scores (torch.Tensor): scores in shape (N, ).
idxs (torch.Tensor): each index value corresponds to a bbox cluster,
and NMS will not be applied between elements of different idxs,
shape (N, ).
nms_cfg (dict): specify nms type and other parameters like iou_thr.
Possible keys include the following.
- iou_threshold (float): IoU threshold used for NMS.
- split_thr (float): threshold number of boxes. In some cases the
number of boxes is large (e.g., 200k). To avoid OOM during
training, the users could set `split_thr` to a small value.
If the number of boxes is greater than the threshold, it will
perform NMS on each group of boxes separately and sequentially.
Defaults to 10000.
class_agnostic (bool): if true, nms is class agnostic,
i.e. IoU thresholding happens over all boxes,
regardless of the predicted class.
Returns:
tuple: kept dets and indices.
"""
nms_cfg_ = nms_cfg.copy()
class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
if class_agnostic:
boxes_for_nms = boxes
else:
max_coordinate = boxes.max()
offsets = idxs.to(boxes) * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None]
nms_cfg_.pop('type', 'nms')
split_thr = nms_cfg_.pop('split_thr', 10000)
if len(boxes_for_nms) < split_thr:
keep = nms(boxes_for_nms, scores, **nms_cfg_)
boxes = boxes[keep]
scores = scores[keep]
else:
total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
for cls_id in torch.unique(idxs):
mask = (idxs == cls_id).nonzero(as_tuple=False).view(-1)
keep = nms(boxes_for_nms[mask], scores[mask], **nms_cfg_)
total_mask[mask[keep]] = True

keep = total_mask.nonzero(as_tuple=False).view(-1)
keep = keep[scores[keep].argsort(descending=True)]
boxes = boxes[keep]
scores = scores[keep]

return torch.cat([boxes, scores[:, None]], -1), keep


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
nms_cfg,
max_num=-1,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
nms_cfg (dict): NMS config, e.g. dict(type='nms', iou_threshold=0.6)
max_num (int): if there are more than max_num bboxes after NMS,
only top max_num will be kept.
score_factors (Tensor): The factors multiplied to scores before
applying NMS

Returns:
tuple: (bboxes, labels), tensors of shape (k, 5) and (k,). Labels \
are 0-based.
"""
num_classes = multi_scores.size(1) - 1
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores[:, :-1]

valid_mask = scores > score_thr

bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4)
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)

if torch.onnx.is_in_onnx_export():
raise RuntimeError('[ONNX Error] Can not record NMS '
'as it has not been executed this time')
return bboxes, labels

dets, keep = batched_nms(bboxes, scores, labels, nms_cfg)

if max_num > 0:
dets = dets[:max_num]
keep = keep[:max_num]

return dets, labels[keep]


def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.

Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.

Returns:
Tensor: Decoded bboxes.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return torch.stack([x1, y1, x2, y2], -1)


def warp_boxes(boxes, M, width, height):
n = len(boxes)
if n:
xy = np.ones((n * 4, 3))
xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)
xy = xy @ M.T
xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
return xy.astype(np.float32)
else:
return boxes


class NanoDetPlusHead(nn.Module):
"""Detection head used in NanoDet-Plus.

Args:
num_classes (int): Number of categories excluding the background
category.
input_channel (int): Number of channels of the input feature.
feat_channels (int): Number of channels of the feature.
Default: 96.
stacked_convs (int): Number of conv layers in the stacked convs.
Default: 2.
kernel_size (int): Size of the convolving kernel. Default: 5.
strides (list[int]): Strides of input multi-level feature maps.
Default: [8, 16, 32].
conv_type (str): Type of the convolution.
Default: "DWConv".
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN').
reg_max (int): The maximal value of the discrete set. Default: 7.
activation (str): Type of activation function. Default: "LeakyReLU".
assigner_cfg (dict): Config dict of the assigner. Default: dict(topk=13).
"""

def __init__(self,
num_classes,
input_channel,
feat_channels=96,
stacked_convs=2,
kernel_size=5,
strides=[8, 16, 32],
conv_type='DWConv',
norm_cfg=dict(type='BN'),
reg_max=7,
activation='LeakyReLU',
assigner_cfg=dict(topk=13),
**kwargs):
super(NanoDetPlusHead, self).__init__()
self.num_classes = num_classes
self.in_channels = input_channel
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.kernel_size = kernel_size
self.strides = strides
self.reg_max = reg_max
self.activation = activation
self.ConvModule = ConvModule if conv_type == 'Conv' else DepthwiseConvModule

self.norm_cfg = norm_cfg
self.distribution_project = Integral(self.reg_max)

self._init_layers()

def _init_layers(self):
self.cls_convs = nn.ModuleList()
for _ in self.strides:
cls_convs = self._build_not_shared_head()
self.cls_convs.append(cls_convs)

self.gfl_cls = nn.ModuleList([
nn.Conv2d(
self.feat_channels,
self.num_classes + 4 * (self.reg_max + 1),
1,
padding=0,
) for _ in self.strides
])

def _build_not_shared_head(self):
cls_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
cls_convs.append(
self.ConvModule(
chn,
self.feat_channels,
self.kernel_size,
stride=1,
padding=self.kernel_size // 2,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None,
activation=self.activation,
))
return cls_convs

def forward(self, feats):
if torch.onnx.is_in_onnx_export():
return self._forward_onnx(feats)
outputs = []
for feat, cls_convs, gfl_cls in zip(
feats,
self.cls_convs,
self.gfl_cls,
):
for conv in cls_convs:
feat = conv(feat)
output = gfl_cls(feat)
outputs.append(output.flatten(start_dim=2))
outputs = torch.cat(outputs, dim=2).permute(0, 2, 1)
return outputs

def post_process(self, preds, meta):
"""Prediction results post processing. Decode bboxes and rescale
to original image size.
Args:
preds (Tensor): Prediction output.
meta (dict): Meta info.
"""
cls_scores, bbox_preds = preds.split(
[self.num_classes, 4 * (self.reg_max + 1)], dim=-1)
result_list = self.get_bboxes(cls_scores, bbox_preds, meta)
det_results = {}
warp_matrixes = (
meta['warp_matrix']
if isinstance(meta['warp_matrix'], list) else meta['warp_matrix'])
img_heights = (
meta['img_info']['height'].cpu().numpy() if isinstance(
meta['img_info']['height'], torch.Tensor) else
meta['img_info']['height'])
img_widths = (
meta['img_info']['width'].cpu().numpy() if isinstance(
meta['img_info']['width'], torch.Tensor) else
meta['img_info']['width'])
img_ids = (
meta['img_info']['id'].cpu().numpy() if isinstance(
meta['img_info']['id'], torch.Tensor) else
meta['img_info']['id'])

for result, img_width, img_height, img_id, warp_matrix in zip(
result_list, img_widths, img_heights, img_ids, warp_matrixes):
det_result = {}
det_bboxes, det_labels = result
det_bboxes = det_bboxes.detach().cpu().numpy()
det_bboxes[:, :4] = warp_boxes(det_bboxes[:, :4],
np.linalg.inv(warp_matrix),
img_width, img_height)
classes = det_labels.detach().cpu().numpy()
for i in range(self.num_classes):
inds = classes == i
det_result[i] = np.concatenate(
[
det_bboxes[inds, :4].astype(np.float32),
det_bboxes[inds, 4:5].astype(np.float32),
],
axis=1,
).tolist()
det_results[img_id] = det_result
return det_results

def get_bboxes(self, cls_preds, reg_preds, img_metas):
"""Decode the outputs to bboxes.
Args:
cls_preds (Tensor): Shape (num_imgs, num_points, num_classes).
reg_preds (Tensor): Shape (num_imgs, num_points, 4 * (regmax + 1)).
img_metas (dict): Dict of image info.

Returns:
results_list (list[tuple]): List of detection bboxes and labels.
"""
device = cls_preds.device
b = cls_preds.shape[0]
input_height, input_width = img_metas['img'].shape[2:]
input_shape = (input_height, input_width)

featmap_sizes = [(math.ceil(input_height / stride),
math.ceil(input_width / stride))
for stride in self.strides]
mlvl_center_priors = [
self.get_single_level_center_priors(
b,
featmap_sizes[i],
stride,
dtype=torch.float32,
device=device,
) for i, stride in enumerate(self.strides)
]
center_priors = torch.cat(mlvl_center_priors, dim=1)
dis_preds = self.distribution_project(reg_preds) * center_priors[...,
2,
None]
bboxes = distance2bbox(
center_priors[..., :2], dis_preds, max_shape=input_shape)
scores = cls_preds.sigmoid()
result_list = []
for i in range(b):
score, bbox = scores[i], bboxes[i]
padding = score.new_zeros(score.shape[0], 1)
score = torch.cat([score, padding], dim=1)
results = multiclass_nms(
bbox,
score,
score_thr=0.05,
nms_cfg=dict(type='nms', iou_threshold=0.6),
max_num=100,
)
result_list.append(results)
return result_list

def get_single_level_center_priors(self, batch_size, featmap_size, stride,
dtype, device):
"""Generate centers of a single stage feature map.
Args:
batch_size (int): Number of images in one batch.
featmap_size (tuple[int]): height and width of the feature map
stride (int): down sample stride of the feature map
dtype (obj:`torch.dtype`): data type of the tensors
device (obj:`torch.device`): device of the tensors
Return:
priors (Tensor): center priors of a single level feature map.
"""
h, w = featmap_size
x_range = (torch.arange(w, dtype=dtype, device=device)) * stride
y_range = (torch.arange(h, dtype=dtype, device=device)) * stride
y, x = torch.meshgrid(y_range, x_range)
y = y.flatten()
x = x.flatten()
strides = x.new_full((x.shape[0], ), stride)
priors = torch.stack([x, y, strides, strides], dim=-1)
return priors.unsqueeze(0).repeat(batch_size, 1, 1)
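
The decoding performed in get_bboxes is easiest to see on a toy input; the numbers below are made up purely for illustration.

import torch
from modelscope.models.cv.face_human_hand_detection.nanodet_plus_head import distance2bbox

# Two prior centers and their predicted (left, top, right, bottom) distances.
points = torch.tensor([[8.0, 8.0], [16.0, 8.0]])
distance = torch.tensor([[4.0, 4.0, 4.0, 4.0], [2.0, 3.0, 5.0, 6.0]])
print(distance2bbox(points, distance, max_shape=(320, 320)))
# tensor([[ 4.,  4., 12., 12.],
#         [14.,  5., 21., 14.]])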

+ 64
- 0
modelscope/models/cv/face_human_hand_detection/one_stage_detector.py View File

@@ -0,0 +1,64 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import torch
import torch.nn as nn

from .ghost_pan import GhostPAN
from .nanodet_plus_head import NanoDetPlusHead
from .shufflenetv2 import ShuffleNetV2


class OneStageDetector(nn.Module):

def __init__(self):
super(OneStageDetector, self).__init__()
self.backbone = ShuffleNetV2(
model_size='1.0x',
out_stages=(2, 3, 4),
with_last_conv=False,
kernel_size=3,
activation='LeakyReLU',
pretrain=False)
self.fpn = GhostPAN(
in_channels=[116, 232, 464],
out_channels=96,
use_depthwise=True,
kernel_size=5,
expand=1,
num_blocks=1,
use_res=False,
num_extra_level=1,
upsample_cfg=dict(scale_factor=2, mode='bilinear'),
norm_cfg=dict(type='BN'),
activation='LeakyReLU')
self.head = NanoDetPlusHead(
num_classes=3,
input_channel=96,
feat_channels=96,
stacked_convs=2,
kernel_size=5,
strides=[8, 16, 32, 64],
conv_type='DWConv',
norm_cfg=dict(type='BN'),
reg_max=7,
activation='LeakyReLU',
assigner_cfg=dict(topk=13))
self.epoch = 0

def forward(self, x):
x = self.backbone(x)
if hasattr(self, 'fpn'):
x = self.fpn(x)
if hasattr(self, 'head'):
x = self.head(x)
return x

def inference(self, meta):
    with torch.no_grad():
        # Synchronize only when CUDA is in use; calling it on a CPU-only setup fails.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        preds = self(meta['img'])
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        results = self.head.post_process(preds, meta)
    return results
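
A dummy forward pass (sketch only, random weights) shows the shape contract between the backbone, neck and head configured above.

import torch
from modelscope.models.cv.face_human_hand_detection.one_stage_detector import OneStageDetector

detector = OneStageDetector().eval()
with torch.no_grad():
    preds = detector(torch.randn(1, 3, 320, 320))
# 2125 priors = 40*40 + 20*20 + 10*10 + 5*5; 35 = 3 classes + 4 * (reg_max + 1).
print(preds.shape)  # torch.Size([1, 2125, 35])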

+ 182
- 0
modelscope/models/cv/face_human_hand_detection/shufflenetv2.py View File

@@ -0,0 +1,182 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import torch
import torch.nn as nn

from .utils import act_layers


def channel_shuffle(x, groups):
batchsize, num_channels, height, width = x.data.size()
channels_per_group = num_channels // groups

x = x.view(batchsize, groups, channels_per_group, height, width)

x = torch.transpose(x, 1, 2).contiguous()

x = x.view(batchsize, -1, height, width)

return x


class ShuffleV2Block(nn.Module):

def __init__(self, inp, oup, stride, activation='ReLU'):
super(ShuffleV2Block, self).__init__()

if not (1 <= stride <= 3):
raise ValueError('illegal stride value')
self.stride = stride

branch_features = oup // 2
assert (self.stride != 1) or (inp == branch_features << 1)

if self.stride > 1:
self.branch1 = nn.Sequential(
self.depthwise_conv(
inp, inp, kernel_size=3, stride=self.stride, padding=1),
nn.BatchNorm2d(inp),
nn.Conv2d(
inp,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False),
nn.BatchNorm2d(branch_features),
act_layers(activation),
)
else:
self.branch1 = nn.Sequential()

self.branch2 = nn.Sequential(
nn.Conv2d(
inp if (self.stride > 1) else branch_features,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False,
),
nn.BatchNorm2d(branch_features),
act_layers(activation),
self.depthwise_conv(
branch_features,
branch_features,
kernel_size=3,
stride=self.stride,
padding=1,
),
nn.BatchNorm2d(branch_features),
nn.Conv2d(
branch_features,
branch_features,
kernel_size=1,
stride=1,
padding=0,
bias=False,
),
nn.BatchNorm2d(branch_features),
act_layers(activation),
)

@staticmethod
def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
return nn.Conv2d(
i, o, kernel_size, stride, padding, bias=bias, groups=i)

def forward(self, x):
if self.stride == 1:
x1, x2 = x.chunk(2, dim=1)
out = torch.cat((x1, self.branch2(x2)), dim=1)
else:
out = torch.cat((self.branch1(x), self.branch2(x)), dim=1)

out = channel_shuffle(out, 2)

return out


class ShuffleNetV2(nn.Module):

def __init__(
self,
model_size='1.5x',
out_stages=(2, 3, 4),
with_last_conv=False,
kernel_size=3,
activation='ReLU',
pretrain=True,
):
super(ShuffleNetV2, self).__init__()
assert set(out_stages).issubset((2, 3, 4))

print('model size is ', model_size)

self.stage_repeats = [4, 8, 4]
self.model_size = model_size
self.out_stages = out_stages
self.with_last_conv = with_last_conv
self.kernel_size = kernel_size
self.activation = activation
if model_size == '0.5x':
self._stage_out_channels = [24, 48, 96, 192, 1024]
elif model_size == '1.0x':
self._stage_out_channels = [24, 116, 232, 464, 1024]
elif model_size == '1.5x':
self._stage_out_channels = [24, 176, 352, 704, 1024]
elif model_size == '2.0x':
self._stage_out_channels = [24, 244, 488, 976, 2048]
else:
raise NotImplementedError

# building first layer
input_channels = 3
output_channels = self._stage_out_channels[0]
self.conv1 = nn.Sequential(
nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False),
nn.BatchNorm2d(output_channels),
act_layers(activation),
)
input_channels = output_channels

self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
for name, repeats, output_channels in zip(
stage_names, self.stage_repeats, self._stage_out_channels[1:]):
seq = [
ShuffleV2Block(
input_channels, output_channels, 2, activation=activation)
]
for i in range(repeats - 1):
seq.append(
ShuffleV2Block(
output_channels,
output_channels,
1,
activation=activation))
setattr(self, name, nn.Sequential(*seq))
input_channels = output_channels
output_channels = self._stage_out_channels[-1]
if self.with_last_conv:
conv5 = nn.Sequential(
nn.Conv2d(
input_channels, output_channels, 1, 1, 0, bias=False),
nn.BatchNorm2d(output_channels),
act_layers(activation),
)
self.stage4.add_module('conv5', conv5)

def forward(self, x):
x = self.conv1(x)
x = self.maxpool(x)
output = []

for i in range(2, 5):
stage = getattr(self, 'stage{}'.format(i))
x = stage(x)
if i in self.out_stages:
output.append(x)
return tuple(output)
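
As a rough sanity check (not part of the diff), the backbone returns the three feature maps consumed by GhostPAN above.

import torch
from modelscope.models.cv.face_human_hand_detection.shufflenetv2 import ShuffleNetV2

backbone = ShuffleNetV2(model_size='1.0x', out_stages=(2, 3, 4), pretrain=False).eval()
with torch.no_grad():
    c3, c4, c5 = backbone(torch.randn(1, 3, 320, 320))
# Strides 8/16/32 with 116/232/464 channels for the '1.0x' width.
print(c3.shape, c4.shape, c5.shape)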

+ 277
- 0
modelscope/models/cv/face_human_hand_detection/utils.py View File

@@ -0,0 +1,277 @@
# The implementation here is modified based on nanodet,
# originally Apache 2.0 License and publicly available at https://github.com/RangiLyu/nanodet

import warnings

import torch
import torch.nn as nn

activations = {
'ReLU': nn.ReLU,
'LeakyReLU': nn.LeakyReLU,
'ReLU6': nn.ReLU6,
'SELU': nn.SELU,
'ELU': nn.ELU,
'GELU': nn.GELU,
'PReLU': nn.PReLU,
'SiLU': nn.SiLU,
'HardSwish': nn.Hardswish,
'Hardswish': nn.Hardswish,
None: nn.Identity,
}


def act_layers(name):
assert name in activations.keys()
if name == 'LeakyReLU':
return nn.LeakyReLU(negative_slope=0.1, inplace=True)
elif name == 'GELU':
return nn.GELU()
elif name == 'PReLU':
return nn.PReLU()
else:
return activations[name](inplace=True)


norm_cfg = {
'BN': ('bn', nn.BatchNorm2d),
'SyncBN': ('bn', nn.SyncBatchNorm),
'GN': ('gn', nn.GroupNorm),
}


def build_norm_layer(cfg, num_features, postfix=''):
"""Build normalization layer

Args:
cfg (dict): cfg should contain:
type (str): identify norm layer type.
layer args: args needed to instantiate a norm layer.
requires_grad (bool): [optional] whether to stop gradient updates
num_features (int): number of channels from input.
postfix (int, str): appended into norm abbreviation to
create named layer.

Returns:
name (str): abbreviation + postfix
layer (nn.Module): created norm layer
"""
assert isinstance(cfg, dict) and 'type' in cfg
cfg_ = cfg.copy()

layer_type = cfg_.pop('type')
if layer_type not in norm_cfg:
raise KeyError('Unrecognized norm type {}'.format(layer_type))
else:
abbr, norm_layer = norm_cfg[layer_type]
if norm_layer is None:
raise NotImplementedError

assert isinstance(postfix, (int, str))
name = abbr + str(postfix)

requires_grad = cfg_.pop('requires_grad', True)
cfg_.setdefault('eps', 1e-5)
if layer_type != 'GN':
layer = norm_layer(num_features, **cfg_)
if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
layer._specify_ddp_gpu_num(1)
else:
assert 'num_groups' in cfg_
layer = norm_layer(num_channels=num_features, **cfg_)

for param in layer.parameters():
param.requires_grad = requires_grad

return name, layer


class ConvModule(nn.Module):
"""A conv block that contains conv/norm/activation layers.

Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
conv_cfg (dict): Config dict for convolution layer.
norm_cfg (dict): Config dict for normalization layer.
activation (str): activation layer, "ReLU" by default.
inplace (bool): Whether to use inplace mode for activation.
order (tuple[str]): The order of conv/norm/activation layers. It is a
sequence of "conv", "norm" and "act". Examples are
("conv", "norm", "act") and ("act", "conv", "norm").
"""

def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias='auto',
conv_cfg=None,
norm_cfg=None,
activation='ReLU',
inplace=True,
order=('conv', 'norm', 'act'),
):
super(ConvModule, self).__init__()
assert conv_cfg is None or isinstance(conv_cfg, dict)
assert norm_cfg is None or isinstance(norm_cfg, dict)
assert activation is None or isinstance(activation, str)
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.activation = activation
self.inplace = inplace
self.order = order
assert isinstance(self.order, tuple) and len(self.order) == 3
assert set(order) == {'conv', 'norm', 'act'}

self.with_norm = norm_cfg is not None
if bias == 'auto':
bias = False if self.with_norm else True
self.with_bias = bias

if self.with_norm and self.with_bias:
warnings.warn('ConvModule has norm and bias at the same time')

self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
self.in_channels = self.conv.in_channels
self.out_channels = self.conv.out_channels
self.kernel_size = self.conv.kernel_size
self.stride = self.conv.stride
self.padding = self.conv.padding
self.dilation = self.conv.dilation
self.transposed = self.conv.transposed
self.output_padding = self.conv.output_padding
self.groups = self.conv.groups

if self.with_norm:
if order.index('norm') > order.index('conv'):
norm_channels = out_channels
else:
norm_channels = in_channels
self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
self.add_module(self.norm_name, norm)
else:
self.norm_name = None

if self.activation:
self.act = act_layers(self.activation)

@property
def norm(self):
if self.norm_name:
return getattr(self, self.norm_name)
else:
return None

def forward(self, x, norm=True):
for layer in self.order:
if layer == 'conv':
x = self.conv(x)
elif layer == 'norm' and norm and self.with_norm:
x = self.norm(x)
elif layer == 'act' and self.activation:
x = self.act(x)
return x


class DepthwiseConvModule(nn.Module):

def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
bias='auto',
norm_cfg=dict(type='BN'),
activation='ReLU',
inplace=True,
order=('depthwise', 'dwnorm', 'act', 'pointwise', 'pwnorm', 'act'),
):
super(DepthwiseConvModule, self).__init__()
assert activation is None or isinstance(activation, str)
self.activation = activation
self.inplace = inplace
self.order = order
assert isinstance(self.order, tuple) and len(self.order) == 6
assert set(order) == {
'depthwise',
'dwnorm',
'act',
'pointwise',
'pwnorm',
'act',
}

self.with_norm = norm_cfg is not None
if bias == 'auto':
bias = False if self.with_norm else True
self.with_bias = bias

if self.with_norm and self.with_bias:
warnings.warn('DepthwiseConvModule has norm and bias at the same time')

self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=in_channels,
bias=bias,
)
self.pointwise = nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=bias)

self.in_channels = self.depthwise.in_channels
self.out_channels = self.pointwise.out_channels
self.kernel_size = self.depthwise.kernel_size
self.stride = self.depthwise.stride
self.padding = self.depthwise.padding
self.dilation = self.depthwise.dilation
self.transposed = self.depthwise.transposed
self.output_padding = self.depthwise.output_padding

if self.with_norm:
_, self.dwnorm = build_norm_layer(norm_cfg, in_channels)
_, self.pwnorm = build_norm_layer(norm_cfg, out_channels)

if self.activation:
self.act = act_layers(self.activation)

def forward(self, x, norm=True):
for layer_name in self.order:
if layer_name != 'act':
layer = self.__getattr__(layer_name)
x = layer(x)
elif layer_name == 'act' and self.activation:
x = self.act(x)
return x
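
A short sketch of how these building blocks compose (illustrative channel sizes only).

import torch
from modelscope.models.cv.face_human_hand_detection.utils import (
    ConvModule, DepthwiseConvModule, act_layers)

conv = ConvModule(16, 32, 3, padding=1, norm_cfg=dict(type='BN'), activation='LeakyReLU')
dw = DepthwiseConvModule(32, 64, 5, stride=2, padding=2)
x = torch.randn(1, 16, 64, 64)
y = conv(x)  # (1, 32, 64, 64): conv -> bn -> LeakyReLU
z = dw(y)    # (1, 64, 32, 32): depthwise -> bn -> act -> pointwise -> bn -> act
print(y.shape, z.shape, act_layers('ReLU6'))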

+ 2
- 0
modelscope/models/cv/image_colorization/unet.py View File

@@ -1,3 +1,5 @@
# The implementation here is modified based on DeOldify, originally MIT License
# and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py
import numpy as np
import torch
import torch.nn as nn


+ 2
- 0
modelscope/models/cv/image_colorization/utils.py View File

@@ -1,3 +1,5 @@
# The implementation here is modified based on DeOldify, originally MIT License and
# publicly available at https://github.com/jantic/DeOldify/blob/master/fastai/callbacks/hooks.py
import functools
from enum import Enum



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/align_faces.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from Face-Alignment,
# publicly available at https://github.com/foamliu/Face-Alignment/blob/master/align_faces.py
import cv2
import numpy as np
from skimage import transform as trans


+ 1
- 0
modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import cv2


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from FaceQuality, made publicly available under the MIT License
# at https://github.com/deepcam-cn/FaceQuality/blob/master/models/model_resnet.py
import torch
from torch import nn



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/gpen.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/blob/main/face_model/gpen_model.py
import functools
import itertools
import math


+ 1
- 0
modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import os.path as osp
from copy import deepcopy


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/helpers.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from InsightFace_Pytorch,
# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from collections import namedtuple

import torch


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/losses.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/tree/main/training/loss/id_loss.py
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from InsightFace_Pytorch,
# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear,
Module, PReLU, Sequential)



+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py View File

@@ -1,3 +1,5 @@
# The GPEN implementation is also open-sourced by the authors,
# and available at https://github.com/yangxy/GPEN/blob/main/face_detect/retinaface_detection.py
import os

import cv2


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License
# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py
import time

import torch


+ 2
- 0
modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License
# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py
from collections import OrderedDict

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/model.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/models/autoencoder.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 2
- 0
modelscope/models/cv/image_to_image_generation/models/clip.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/ops/diffusion.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_generation/ops/losses.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/data/transforms.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math
import random



+ 1
- 0
modelscope/models/cv/image_to_image_translation/model_translation.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/models/autoencoder.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 2
- 0
modelscope/models/cv/image_to_image_translation/models/clip.py View File

@@ -1,3 +1,5 @@
# Part of the implementation is borrowed and modified from CLIP, publicly available at https://github.com/openai/CLIP.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/apps.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
# APPs that facilitate the use of pretrained neural networks.

import os.path as osp


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/degradation.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math
import os
import random


+ 3
- 0
modelscope/models/cv/image_to_image_translation/ops/diffusion.py View File

@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from latent-diffusion,
# publicly available at https://github.com/CompVis/latent-diffusion.
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/losses.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import math

import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/metrics.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import numpy as np
import scipy.linalg as linalg
import torch


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/random_color.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import colorsys
import random



+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/random_mask.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import cv2
import numpy as np



+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/svd.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
r"""SVD of linear degradation matrices described in the paper
``Denoising Diffusion Restoration Models.''
@article{kawar2022denoising,


+ 1
- 0
modelscope/models/cv/image_to_image_translation/ops/utils.py View File

@@ -1,3 +1,4 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
import base64
import binascii
import hashlib


+ 12
- 9
modelscope/models/cv/movie_scene_segmentation/model.py View File

@@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel):
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

self.infer_result = {'vid': [], 'sid': [], 'pred': []}
sampling_method = self.cfg.dataset.sampling_method.name
self.neighbor_size = self.cfg.dataset.sampling_method.params[
sampling_method].neighbor_size
@@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel):
shot_num = len(sids)
cnt = shot_num // bs + 1

infer_sid, infer_pred = [], []
infer_result = {}
for i in range(cnt):
start = i * bs
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
@@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel):
input_ = torch.stack(input_)
outputs = self.shared_step(input_) # shape [b,2]
prob = F.softmax(outputs, dim=1)
self.infer_result['sid'].extend(sid_.cpu().detach().numpy())
self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy())
self.infer_result['pred'] = np.stack(self.infer_result['pred'])
infer_sid.extend(sid_.cpu().detach().numpy())
infer_pred.extend(prob[:, 1].cpu().detach().numpy())
infer_result.update({'pred': np.stack(infer_pred)})
infer_result.update({'sid': infer_sid})

assert len(self.infer_result['sid']) == len(sids)
assert len(self.infer_result['pred']) == len(inputs)
return self.infer_result
assert len(infer_result['sid']) == len(sids)
assert len(infer_result['pred']) == len(inputs)
return infer_result

def shared_step(self, inputs):
with torch.no_grad():
@@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel):
thres = self.cfg.pipeline.save_threshold

anno_dict = get_pred_boundary(pred_dict, thres)
scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict)
scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
self.shot2keyf, anno_dict)
if self.cfg.pipeline.save_split_scene:
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
print(f'Split scene video saved to {re_dir}')
return len(scene_list), scene_dict_lst
return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst

def preprocess(self, inputs):
logger.info('Begin shot detect......')


+ 10
- 2
modelscope/models/cv/movie_scene_segmentation/utils/save_op.py View File

@@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict):
scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict)

scene_dict_lst = []
shot_num = len(shot2keyf)
shot_dict_lst = []
for item in shot2keyf:
tmp = item.split(' ')
shot_dict_lst.append({
'frame': [tmp[0], tmp[1]],
'timestamps': [tmp[-2], tmp[-1]]
})
assert len(scene_list) == len(pair_list)
for scene_ind, scene_item in enumerate(scene_list):
scene_dict_lst.append({
'shot': pair_list[scene_ind],
'frame': scene_item[0],
'timestamp': scene_item[1]
'timestamps': scene_item[1]
})

return scene_dict_lst, scene_list
return scene_dict_lst, scene_list, shot_num, shot_dict_lst


def scene2video(source_movie_fn, scene_list, thres):


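To make the new return values concrete: each shot2keyf entry is assumed here to be a space-separated record whose first two fields are frame indices and last two are timestamps (inferred from the split(' ') indexing above), so a per-shot entry would look like this.

item = '0 119 keyframe_0000.jpg 0.000 4.963'  # hypothetical shot2keyf record
tmp = item.split(' ')
shot_entry = {'frame': [tmp[0], tmp[1]], 'timestamps': [tmp[-2], tmp[-1]]}
print(shot_entry)  # {'frame': ['0', '119'], 'timestamps': ['0.000', '4.963']}
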
+ 20
- 0
modelscope/models/cv/product_segmentation/__init__.py View File

@@ -0,0 +1,20 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .seg_infer import F3NetForProductSegmentation

else:
_import_structure = {'seg_infer': ['F3NetForProductSegmentation']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 197
- 0
modelscope/models/cv/product_segmentation/net.py View File

@@ -0,0 +1,197 @@
# The implementation here is modified based on F3Net,
# originally Apache 2.0 License and publicly available at https://github.com/weijun88/F3Net

import torch
import torch.nn as nn
import torch.nn.functional as F


class Bottleneck(nn.Module):

def __init__(self,
inplanes,
planes,
stride=1,
downsample=None,
dilation=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(
planes,
planes,
kernel_size=3,
stride=stride,
padding=(3 * dilation - 1) // 2,
bias=False,
dilation=dilation)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.downsample = downsample

def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)), inplace=True)
out = F.relu(self.bn2(self.conv2(out)), inplace=True)
out = self.bn3(self.conv3(out))
if self.downsample is not None:
x = self.downsample(x)
return F.relu(out + x, inplace=True)


class ResNet(nn.Module):

def __init__(self):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(
3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.layer1 = self.make_layer(64, 3, stride=1, dilation=1)
self.layer2 = self.make_layer(128, 4, stride=2, dilation=1)
self.layer3 = self.make_layer(256, 6, stride=2, dilation=1)
self.layer4 = self.make_layer(512, 3, stride=2, dilation=1)

def make_layer(self, planes, blocks, stride, dilation):
downsample = nn.Sequential(
nn.Conv2d(
self.inplanes,
planes * 4,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm2d(planes * 4))
layers = [
Bottleneck(
self.inplanes, planes, stride, downsample, dilation=dilation)
]
self.inplanes = planes * 4
for _ in range(1, blocks):
layers.append(Bottleneck(self.inplanes, planes, dilation=dilation))
return nn.Sequential(*layers)

def forward(self, x):
x = x.reshape(1, 3, 448, 448)
out1 = F.relu(self.bn1(self.conv1(x)), inplace=True)
out1 = F.max_pool2d(out1, kernel_size=3, stride=2, padding=1)
out2 = self.layer1(out1)
out3 = self.layer2(out2)
out4 = self.layer3(out3)
out5 = self.layer4(out4)
return out2, out3, out4, out5


class CFM(nn.Module):

def __init__(self):
super(CFM, self).__init__()
self.conv1h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn1h = nn.BatchNorm2d(64)
self.conv2h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn2h = nn.BatchNorm2d(64)
self.conv3h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn3h = nn.BatchNorm2d(64)
self.conv4h = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn4h = nn.BatchNorm2d(64)

self.conv1v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn1v = nn.BatchNorm2d(64)
self.conv2v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn2v = nn.BatchNorm2d(64)
self.conv3v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn3v = nn.BatchNorm2d(64)
self.conv4v = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
self.bn4v = nn.BatchNorm2d(64)

def forward(self, left, down):
if down.size()[2:] != left.size()[2:]:
down = F.interpolate(down, size=left.size()[2:], mode='bilinear')
out1h = F.relu(self.bn1h(self.conv1h(left)), inplace=True)
out2h = F.relu(self.bn2h(self.conv2h(out1h)), inplace=True)
out1v = F.relu(self.bn1v(self.conv1v(down)), inplace=True)
out2v = F.relu(self.bn2v(self.conv2v(out1v)), inplace=True)
fuse = out2h * out2v
out3h = F.relu(self.bn3h(self.conv3h(fuse)), inplace=True) + out1h
out4h = F.relu(self.bn4h(self.conv4h(out3h)), inplace=True)
out3v = F.relu(self.bn3v(self.conv3v(fuse)), inplace=True) + out1v
out4v = F.relu(self.bn4v(self.conv4v(out3v)), inplace=True)
return out4h, out4v


class Decoder(nn.Module):

def __init__(self):
super(Decoder, self).__init__()
self.cfm45 = CFM()
self.cfm34 = CFM()
self.cfm23 = CFM()

def forward(self, out2h, out3h, out4h, out5v, fback=None):
if fback is not None:
refine5 = F.interpolate(
fback, size=out5v.size()[2:], mode='bilinear')
refine4 = F.interpolate(
fback, size=out4h.size()[2:], mode='bilinear')
refine3 = F.interpolate(
fback, size=out3h.size()[2:], mode='bilinear')
refine2 = F.interpolate(
fback, size=out2h.size()[2:], mode='bilinear')
out5v = out5v + refine5
out4h, out4v = self.cfm45(out4h + refine4, out5v)
out3h, out3v = self.cfm34(out3h + refine3, out4v)
out2h, pred = self.cfm23(out2h + refine2, out3v)
else:
out4h, out4v = self.cfm45(out4h, out5v)
out3h, out3v = self.cfm34(out3h, out4v)
out2h, pred = self.cfm23(out2h, out3v)
return out2h, out3h, out4h, out5v, pred


class F3Net(nn.Module):

def __init__(self):
super(F3Net, self).__init__()
self.bkbone = ResNet()
self.squeeze5 = nn.Sequential(
nn.Conv2d(2048, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze4 = nn.Sequential(
nn.Conv2d(1024, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze3 = nn.Sequential(
nn.Conv2d(512, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))
self.squeeze2 = nn.Sequential(
nn.Conv2d(256, 64, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True))

self.decoder1 = Decoder()
self.decoder2 = Decoder()
self.linearp1 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearp2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)

self.linearr2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr4 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
self.linearr5 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)

def forward(self, x, shape=None):
x = x.reshape(1, 3, 448, 448)
out2h, out3h, out4h, out5v = self.bkbone(x)
out2h, out3h, out4h, out5v = self.squeeze2(out2h), self.squeeze3(
out3h), self.squeeze4(out4h), self.squeeze5(out5v)
out2h, out3h, out4h, out5v, pred1 = self.decoder1(
out2h, out3h, out4h, out5v)
out2h, out3h, out4h, out5v, pred2 = self.decoder2(
out2h, out3h, out4h, out5v, pred1)

shape = x.size()[2:] if shape is None else shape
pred1 = F.interpolate(
self.linearp1(pred1), size=shape, mode='bilinear')
pred2 = F.interpolate(
self.linearp2(pred2), size=shape, mode='bilinear')

out2h = F.interpolate(
self.linearr2(out2h), size=shape, mode='bilinear')
out3h = F.interpolate(
self.linearr3(out3h), size=shape, mode='bilinear')
out4h = F.interpolate(
self.linearr4(out4h), size=shape, mode='bilinear')
out5h = F.interpolate(
self.linearr5(out5v), size=shape, mode='bilinear')
return pred1, pred2, out2h, out3h, out4h, out5h
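
A dummy forward pass (sketch, random weights) illustrates the output contract of the two-decoder design above.

import torch
from modelscope.models.cv.product_segmentation.net import F3Net

net = F3Net().eval()
with torch.no_grad():
    pred1, pred2, out2h, out3h, out4h, out5h = net(torch.randn(1, 3, 448, 448))
# All six outputs are single-channel maps resized back to the 448x448 input.
print(pred2.shape)  # torch.Size([1, 1, 448, 448])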

+ 77
- 0
modelscope/models/cv/product_segmentation/seg_infer.py View File

@@ -0,0 +1,77 @@
# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.

import cv2
import numpy as np
import torch
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .net import F3Net

logger = get_logger()


def load_state_dict(model_dir, device):
_dict = torch.load(
'{}/{}'.format(model_dir, ModelFile.TORCH_MODEL_BIN_FILE),
map_location=device)
state_dict = {}
for k, v in _dict.items():
if k.startswith('module'):
k = k[7:]
state_dict[k] = v
return state_dict


@MODELS.register_module(
Tasks.product_segmentation, module_name=Models.product_segmentation)
class F3NetForProductSegmentation(TorchModel):

def __init__(self, model_dir, device_id=0, *args, **kwargs):

super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = F3Net()
if torch.cuda.is_available():
self.device = 'cuda'
logger.info('Use GPU')
else:
self.device = 'cpu'
logger.info('Use CPU')

self.params = load_state_dict(model_dir, self.device)
self.model.load_state_dict(self.params)
self.model.to(self.device)
self.model.eval()

def forward(self, x):
pred_result = self.model(x)
return pred_result


mean, std = np.array([[[124.55, 118.90,
102.94]]]), np.array([[[56.77, 55.97, 57.50]]])


def inference(model, device, input_path):
img = Image.open(input_path)
img = np.array(img.convert('RGB')).astype(np.float32)
img = (img - mean) / std
img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR)
img = torch.from_numpy(img)
img = img.permute(2, 0, 1)
img = img.to(device).float()
outputs = model(img)
out = outputs[0]
pred = (torch.sigmoid(out[0, 0]) * 255).cpu().numpy()
pred[pred < 20] = 0
pred = pred[:, :, np.newaxis]
pred = np.round(pred)
logger.info('Inference Done')
return pred
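
For context, a minimal usage sketch of the segmentation entry points above; the model directory and image path are hypothetical placeholders.

from modelscope.models.cv.product_segmentation.seg_infer import (
    F3NetForProductSegmentation, inference)

# 'path/to/model_dir' and 'product.jpg' are placeholders, not files added by this PR.
model = F3NetForProductSegmentation(model_dir='path/to/model_dir')
mask = inference(model, model.device, 'product.jpg')
# mask is a 448x448x1 float array in [0, 255] with weak responses (<20) zeroed.
print(mask.shape, float(mask.max()))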

+ 1
- 0
modelscope/models/cv/skin_retouching/detection_model/detection_module.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/cv/skin_retouching/detection_model/detection_unet_in.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 1
- 0
modelscope/models/cv/skin_retouching/inpainting_model/gconv.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/cv/skin_retouching/inpainting_model/inpainting_unet.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F


+ 1
- 0
modelscope/models/cv/skin_retouching/unet_deploy.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import warnings

import torch


+ 1
- 0
modelscope/models/cv/skin_retouching/utils.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import time
from typing import Dict, List, Optional, Tuple, Union



+ 1
- 0
modelscope/models/cv/skin_retouching/weights_init.py View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn



+ 2
- 0
modelscope/models/cv/super_resolution/arch_util.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License
# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/arch_util.py
import collections.abc
import math
import warnings


+ 2
- 0
modelscope/models/cv/super_resolution/rrdbnet_arch.py View File

@@ -1,3 +1,5 @@
# The implementation is adopted from BasicSR, made publicly available under the Apache 2.0 License
# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/rrdbnet_arch.py
import torch
from torch import nn as nn
from torch.nn import functional as F


+ 2
- 0
modelscope/models/multi_modal/clip/__init__.py View File

@@ -1 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .model import CLIPForMultiModalEmbedding

+ 15
- 0
modelscope/models/multi_modal/clip/model.py View File

@@ -1,3 +1,18 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union


+ 1
- 0
modelscope/models/multi_modal/gemm/gemm_base.py View File

@@ -543,6 +543,7 @@ class GEMMModel(nn.Module):
img_feature, text_feature, caption = None, None, None
if captioning and image is not None:
img_feature, caption = self.model.image_to_text(image)
img_feature = self.parse_feat(img_feature)
elif image is not None:
img_feature = self.parse_feat(self.model.encode_image(image))
if text is not None:


+ 7
- 4
modelscope/models/multi_modal/gemm/gemm_model.py View File

@@ -67,7 +67,7 @@ class GEMMForMultiModalEmbedding(TorchModel):
return img_tensor

def parse_text(self, text_str):
if text_str is None:
if text_str is None or len(text_str) == 0:
return None
if isinstance(text_str, str):
text_ids_tensor = self.gemm_model.tokenize(text_str)
@@ -79,9 +79,12 @@ class GEMMForMultiModalEmbedding(TorchModel):
return text_ids_tensor.view(1, -1)

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
image = self.parse_image(input.get('image', input.get('img', None)))
text = self.parse_text(input.get('text', input.get('txt', None)))
captioning = input.get('captioning', False) is True
image_input = input.get('image', input.get('img', None))
text_input = input.get('text', input.get('txt', None))
captioning_input = input.get('captioning', None)
image = self.parse_image(image_input)
text = self.parse_text(text_input)
captioning = captioning_input is True or text_input == ''
out = self.gemm_model(image, text, captioning)
output = {
OutputKeys.IMG_EMBEDDING: out.get('image_feature', None),
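
A hedged sketch of the reworked captioning switch above, assuming gemm is an already constructed GEMMForMultiModalEmbedding instance and img is an image in whatever form parse_image accepts (both names are placeholders, not part of the diff):

from modelscope.outputs import OutputKeys

out = gemm.forward({'image': img, 'captioning': True})  # explicit captioning request
out = gemm.forward({'image': img, 'text': ''})          # empty text now also triggers captioning
img_embedding = out[OutputKeys.IMG_EMBEDDING]           # None if no image was provided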


+ 1
- 1
modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import random


+ 1
- 1
modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import numpy as np


+ 1
- 1
modelscope/models/multi_modal/mmr/models/tokenization_clip.py View File

@@ -1,4 +1,4 @@
# The implementation is adopated from the CLIP4Clip implementation,
# The implementation is adopted from the CLIP4Clip implementation,
# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip

import gzip


+ 2
- 0
modelscope/models/multi_modal/ofa/__init__.py View File

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel
from .tokenization_ofa import OFATokenizer, OFATokenizerZH
from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast

+ 14
- 0
modelscope/models/multi_modal/ofa/resnet.py View File

@@ -1,3 +1,17 @@
# Copyright 2022 OFA-Sys Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn



+ 1
- 0
modelscope/models/multi_modal/ofa/utils/__init__.py View File

@@ -1 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .constant import OFA_TASK_KEY_MAPPING

+ 2
- 0
modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py View File

@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict



+ 56
- 14
modelscope/outputs.py View File

@@ -21,6 +21,7 @@ class OutputKeys(object):
POLYGONS = 'polygons'
OUTPUT = 'output'
OUTPUT_IMG = 'output_img'
OUTPUT_VIDEO = 'output_video'
OUTPUT_PCM = 'output_pcm'
IMG_EMBEDDING = 'img_embedding'
SPO_LIST = 'spo_list'
@@ -37,8 +38,10 @@ class OutputKeys(object):
KWS_LIST = 'kws_list'
HISTORY = 'history'
TIMESTAMPS = 'timestamps'
SPLIT_VIDEO_NUM = 'split_video_num'
SPLIT_META_LIST = 'split_meta_list'
SHOT_NUM = 'shot_num'
SCENE_NUM = 'scene_num'
SCENE_META_LIST = 'scene_meta_list'
SHOT_META_LIST = 'shot_meta_list'


TASK_OUTPUTS = {
@@ -218,13 +221,21 @@ TASK_OUTPUTS = {

# 3D human body keypoints detection result for single sample
# {
# "poses": [
# [[x, y, z]*17],
# [[x, y, z]*17],
# [[x, y, z]*17]
# ]
# "poses": [ # 3d pose coordinate in camera coordinate
# [[x, y, z]*17], # joints of per image
# [[x, y, z]*17],
# ...
# ],
# "timestamps": [ # timestamps of all frames
# "00:00:0.230",
# "00:00:0.560",
# "00:00:0.690",
# ],
# "output_video": "path_to_rendered_video" , this is optional
# and is only avaialbe when the "render" option is enabled.
# }
Tasks.body_3d_keypoints: [OutputKeys.POSES],
Tasks.body_3d_keypoints:
[OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO],

# 2D hand keypoints result for single sample
# {
@@ -300,19 +311,30 @@ TASK_OUTPUTS = {
Tasks.shop_segmentation: [OutputKeys.MASKS],
# movie scene segmentation result for a single video
# {
# "split_video_num":3,
# "split_meta_list":
# "shot_num":15,
# "shot_meta_list":
# [
# {
# "frame": [start_frame, end_frame],
# "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
#
# }
# ]
# "scene_num":3,
# "scene_meta_list":
# [
# {
# "shot": [0,1,2],
# "frame": [start_frame, end_frame],
# "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
# "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
# }
# ]
#
# }
Tasks.movie_scene_segmentation:
[OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],
Tasks.movie_scene_segmentation: [
OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
OutputKeys.SCENE_META_LIST
],

# ============ nlp tasks ===================

@@ -649,8 +671,28 @@ TASK_OUTPUTS = {
# 'output': ['Done' / 'Decode_Error']
# }
Tasks.video_inpainting: [OutputKeys.OUTPUT],

# {
# 'output': ['bixin']
# }
Tasks.hand_static: [OutputKeys.OUTPUT]
Tasks.hand_static: [OutputKeys.OUTPUT],

# {
# 'output': [
# [2, 75, 287, 240, 510, 0.8335018754005432],
# [1, 127, 83, 332, 366, 0.9175254702568054],
# [0, 0, 0, 367, 639, 0.9693422317504883]]
# }
Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],

# {
#     'output': 'Happiness',
#     'boxes': (203, 104, 663, 564)
# }
Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],

# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.product_segmentation: [OutputKeys.MASKS],
}
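
A hedged sketch of consuming the renamed movie-scene-segmentation keys, assuming result is the output dict returned by the corresponding pipeline (obtained elsewhere; the loop body is illustrative only):

from modelscope.outputs import OutputKeys

# result = movie_scene_segmentation_pipeline(video_path)  # obtained elsewhere, placeholder name
print(result[OutputKeys.SHOT_NUM], result[OutputKeys.SCENE_NUM])
for scene in result[OutputKeys.SCENE_META_LIST]:
    # per the comment block above: shot indices, frame range and timestamp range
    print(scene['shot'], scene['frame'], scene['timestamps'])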

Some files were not shown because too many files changed in this diff
