
Merge remote-tracking branch 'origin/master' into ofa/finetune

master
行嗔 3 years ago
parent commit 5cf6910bed
100 changed files with 11814 additions and 142 deletions
  1. +5 -8 .dev_scripts/ci_container_test.sh
  2. +0 -19 .dev_scripts/citest.sh
  3. +4 -1 .dev_scripts/dockerci.sh
  4. +1 -1 .readthedocs.yaml
  5. +3 -0 data/test/images/facial_expression_recognition.jpg
  6. +3 -0 data/test/images/hand_keypoints.jpg
  7. +3 -0 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png
  8. +3 -0 data/test/images/retina_face_detection.jpg
  9. +3 -0 data/test/images/shop_segmentation.jpg
  10. +3 -0 data/test/images/text_driven_segmentation.jpg
  11. +3 -0 data/test/videos/action_detection_test_video.mp4
  12. +2 -2 data/test/videos/movie_scene_segmentation_test_video.mp4
  13. +1 -1 docker/Dockerfile.ubuntu
  14. +17 -0 modelscope/metainfo.py
  15. +2 -2 modelscope/models/audio/ans/__init__.py
  16. +6 -0 modelscope/models/audio/ans/complex_nn.py
  17. +1 -0 modelscope/models/audio/ans/conv_stft.py
  18. +10 -52 modelscope/models/audio/ans/frcrn.py
  19. +1 -0 modelscope/models/audio/ans/se_module_complex.py
  20. +5 -0 modelscope/models/audio/ans/unet.py
  21. +4 -4 modelscope/models/cv/__init__.py
  22. +21 -0 modelscope/models/cv/action_detection/__init__.py
  23. +177 -0 modelscope/models/cv/action_detection/action_detection_onnx.py
  24. +20 -0 modelscope/models/cv/face_2d_keypoints/__init__.py
  25. +16 -0 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py
  26. +22 -0 modelscope/models/cv/face_detection/__init__.py
  27. +1 -0 modelscope/models/cv/face_detection/retinaface/__init__.py
  28. +137 -0 modelscope/models/cv/face_detection/retinaface/detection.py
  29. +0 -0 modelscope/models/cv/face_detection/retinaface/models/__init__.py
  30. +149 -0 modelscope/models/cv/face_detection/retinaface/models/net.py
  31. +145 -0 modelscope/models/cv/face_detection/retinaface/models/retinaface.py
  32. +123 -0 modelscope/models/cv/face_detection/retinaface/utils.py
  33. +20 -0 modelscope/models/cv/facial_expression_recognition/__init__.py
  34. +2 -0 modelscope/models/cv/facial_expression_recognition/fer/__init__.py
  35. +72 -0 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py
  36. +118 -0 modelscope/models/cv/facial_expression_recognition/fer/transforms.py
  37. +40 -0 modelscope/models/cv/facial_expression_recognition/fer/vgg.py
  38. +20 -0 modelscope/models/cv/shop_segmentation/__init__.py
  39. +59 -0 modelscope/models/cv/shop_segmentation/common.py
  40. +122 -0 modelscope/models/cv/shop_segmentation/head_fpn.py
  41. +901 -0 modelscope/models/cv/shop_segmentation/models.py
  42. +217 -0 modelscope/models/cv/shop_segmentation/neck_fpn.py
  43. +157 -0 modelscope/models/cv/shop_segmentation/shop_seg_base.py
  44. +115 -0 modelscope/models/cv/shop_segmentation/shop_seg_model.py
  45. +199 -0 modelscope/models/cv/shop_segmentation/utils.py
  46. +1 -0 modelscope/models/cv/text_driven_segmentation/__init__.py
  47. +170 -0 modelscope/models/cv/text_driven_segmentation/clip.py
  48. +28 -0 modelscope/models/cv/text_driven_segmentation/lseg_base.py
  49. +334 -0 modelscope/models/cv/text_driven_segmentation/lseg_blocks.py
  50. +107 -0 modelscope/models/cv/text_driven_segmentation/lseg_model.py
  51. +197 -0 modelscope/models/cv/text_driven_segmentation/lseg_net.py
  52. +543 -0 modelscope/models/cv/text_driven_segmentation/lseg_vit.py
  53. +458 -0 modelscope/models/cv/text_driven_segmentation/model.py
  54. +156 -0 modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py
  55. +24 -0 modelscope/models/cv/tinynas_detection/__init__.py
  56. +16 -0 modelscope/models/cv/tinynas_detection/backbone/__init__.py
  57. +126 -0 modelscope/models/cv/tinynas_detection/backbone/darknet.py
  58. +347 -0 modelscope/models/cv/tinynas_detection/backbone/tinynas.py
  59. +2 -0 modelscope/models/cv/tinynas_detection/core/__init__.py
  60. +474 -0 modelscope/models/cv/tinynas_detection/core/base_ops.py
  61. +324 -0 modelscope/models/cv/tinynas_detection/core/neck_ops.py
  62. +205 -0 modelscope/models/cv/tinynas_detection/core/repvgg_block.py
  63. +196 -0 modelscope/models/cv/tinynas_detection/core/utils.py
  64. +181 -0 modelscope/models/cv/tinynas_detection/detector.py
  65. +16 -0 modelscope/models/cv/tinynas_detection/head/__init__.py
  66. +361 -0 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py
  67. +16 -0 modelscope/models/cv/tinynas_detection/neck/__init__.py
  68. +235 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py
  69. +661 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py
  70. +203 -0 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
  71. +16 -0 modelscope/models/cv/tinynas_detection/tinynas_detector.py
  72. +30 -0 modelscope/models/cv/tinynas_detection/utils.py
  73. +6 -4 modelscope/models/multi_modal/mplug/modeling_mplug.py
  74. +10 -6 modelscope/models/nlp/__init__.py
  75. +73 -0 modelscope/models/nlp/deberta_v2/__init__.py
  76. +130 -0 modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py
  77. +1789 -0 modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
  78. +546 -0 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py
  79. +241 -0 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py
  80. +4 -0 modelscope/models/nlp/gpt3/modeling_gpt3.py
  81. +39 -0 modelscope/models/nlp/masked_language.py
  82. +8 -8 modelscope/models/nlp/palm_v2/modeling_palm.py
  83. +20 -0 modelscope/msdatasets/cv/face_2d_keypoins/__init__.py
  84. +13 -0 modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py
  85. +3 -9 modelscope/msdatasets/ms_dataset.py
  86. +18 -6 modelscope/msdatasets/utils/oss_utils.py
  87. +10 -12 modelscope/msdatasets/utils/upload_utils.py
  88. +61 -1 modelscope/outputs.py
  89. +0 -1 modelscope/pipelines/base.py
  90. +16 -1 modelscope/pipelines/builder.py
  91. +19 -3 modelscope/pipelines/cv/__init__.py
  92. +63 -0 modelscope/pipelines/cv/action_detection_pipeline.py
  93. +3 -1 modelscope/pipelines/cv/easycv_pipelines/__init__.py
  94. +41 -0 modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
  95. +128 -0 modelscope/pipelines/cv/facial_expression_recognition_pipeline.py
  96. +51 -0 modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py
  97. +2 -0 modelscope/pipelines/cv/ocr_detection_pipeline.py
  98. +58 -0 modelscope/pipelines/cv/retina_face_detection_pipeline.py
  99. +51 -0 modelscope/pipelines/cv/shop_segmentation_pipleline.py
  100. +51 -0 modelscope/pipelines/cv/text_driven_segmentation_pipleline.py

+5 -8 .dev_scripts/ci_container_test.sh

@@ -1,11 +1,9 @@
pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/tests.txt
# install numpy<=1.18 for tensorflow==1.15.x
pip install "numpy<=1.18"

git config --global --add safe.directory /Maas-lib

@@ -26,4 +24,3 @@ else
fi
echo "Running case with command: $ci_command"
$ci_command
#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py

+0 -19 .dev_scripts/citest.sh

@@ -1,19 +0,0 @@
pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html

pip install -r requirements/tests.txt
# install numpy<=1.18 for tensorflow==1.15.x
pip install "numpy<=1.18"

# linter test
# use internal project for pre-commit due to the network problem
pre-commit run --all-files
if [ $? -ne 0 ]; then
echo "linter test failed, please run 'pre-commit run --all-files' to check"
exit -1
fi

PYTHONPATH=. python tests/run.py

+4 -1 .dev_scripts/dockerci.sh

@@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0'
cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58'
cpu_sets_arr=($cpu_sets)
is_get_file_lock=false
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND}
# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml'
CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND}
echo "ci command: $CI_COMMAND"
for gpu in $gpus
do
@@ -16,6 +17,7 @@ do
echo "get gpu lock $gpu"
CONTAINER_NAME="modelscope-ci-$gpu"
let is_get_file_lock=true

# pull image if there are update
docker pull ${IMAGE_NAME}:${IMAGE_VERSION}
docker run --rm --name $CONTAINER_NAME --shm-size=16gb \
@@ -38,6 +40,7 @@ do
--net host \
${IMAGE_NAME}:${IMAGE_VERSION} \
$CI_COMMAND

if [ $? -ne 0 ]; then
echo "Running test case failed, please check the log!"
exit -1


+1 -1 .readthedocs.yaml

@@ -25,4 +25,4 @@ python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/readthedocs.txt
- requirements: requirements/runtime.txt
- requirements: requirements/framework.txt

+3 -0 data/test/images/facial_expression_recognition.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633
size 161535

+3 -0 data/test/images/hand_keypoints.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988
size 7750

+3 -0 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766
size 54390

+3 -0 data/test/images/retina_face_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9
size 87228

+3 -0 data/test/images/shop_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0
size 238607

+3 -0 data/test/images/text_driven_segmentation.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4
size 67861

+3 -0 data/test/videos/action_detection_test_video.mp4

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92
size 3191059

+2 -2 data/test/videos/movie_scene_segmentation_test_video.mp4

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f
size 126815483
oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443
size 12722029

+1 -1 docker/Dockerfile.ubuntu

@@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
# install modelscope
COPY requirements /var/modelscope
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \


+17 -0 modelscope/metainfo.py

@@ -9,6 +9,8 @@ class Models(object):

Model name should only contain model info but not task info.
"""
tinynas_detection = 'tinynas-detection'

# vision models
detection = 'detection'
realtime_object_detection = 'realtime-object-detection'
@@ -22,12 +24,17 @@ class Models(object):
body_2d_keypoints = 'body-2d-keypoints'
body_3d_keypoints = 'body-3d-keypoints'
crowd_counting = 'HRNetCrowdCounting'
face_2d_keypoints = 'face-2d-keypoints'
panoptic_segmentation = 'swinL-panoptic-segmentation'
image_reid_person = 'passvitb'
video_summarization = 'pgl-video-summarization'
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
resnet50_bert = 'resnet50-bert'
fer = 'fer'
retinaface = 'retinaface'
shop_segmentation = 'shop-segmentation'

# EasyCV models
yolox = 'YOLOX'
@@ -37,6 +44,7 @@ class Models(object):
bert = 'bert'
palm = 'palm-v2'
structbert = 'structbert'
deberta_v2 = 'deberta_v2'
veco = 'veco'
translation = 'csanmt-translation'
space_dst = 'space-dst'
@@ -104,13 +112,17 @@ class Pipelines(object):
hicossl_video_embedding = 'hicossl-s3dg-video_embedding'
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
body_3d_keypoints = 'canonical_body-3d-keypoints_video'
hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
easycv_detection = 'easycv-detection'
easycv_segmentation = 'easycv-segmentation'
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
retina_face_detection = 'resnet50-face-detection-retinaface'
live_category = 'live-category'
general_image_classification = 'vit-base_image-classification_ImageNet-labels'
daily_image_classification = 'vit-base_image-classification_Dailylife-labels'
@@ -132,13 +144,17 @@ class Pipelines(object):
image_to_image_generation = 'image-to-image-generation'
skin_retouching = 'unet-skin-retouching'
tinynas_classification = 'tinynas-classification'
tinynas_detection = 'tinynas-detection'
crowd_counting = 'hrnet-crowd-counting'
action_detection = 'ResNetC3D-action-detection'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'
image_panoptic_segmentation = 'image-panoptic-segmentation'
video_summarization = 'googlenet_pgl_video_summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_reid_person = 'passvitb-image-reid-person'
text_driven_segmentation = 'text-driven-segmentation'
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
shop_segmentation = 'shop-segmentation'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -347,6 +363,7 @@ class Datasets(object):
""" Names for different datasets.
"""
ClsDataset = 'ClsDataset'
Face2dKeypointsDataset = 'Face2dKeypointsDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
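
A minimal usage sketch for the registry keys added above, assuming the standard modelscope pipeline factory; the model ID below is a placeholder, not a model shipped with this commit.

# Hedged sketch: resolve a task through the registered pipeline/model names.
# '<retinaface-model-id>' is a placeholder; substitute a real ModelScope model ID.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detector = pipeline(Tasks.face_detection, model='<retinaface-model-id>')
result = face_detector('data/test/images/retina_face_detection.jpg')
print(result)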

+2 -2 modelscope/models/audio/ans/__init__.py

@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .frcrn import FRCRNModel
from .frcrn import FRCRNDecorator

else:
_import_structure = {
'frcrn': ['FRCRNModel'],
'frcrn': ['FRCRNDecorator'],
}

import sys


+6 -0 modelscope/models/audio/ans/complex_nn.py

@@ -1,3 +1,9 @@
"""
The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d
here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch

"""
import torch
import torch.nn as nn
import torch.nn.functional as F


+1 -0 modelscope/models/audio/ans/conv_stft.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import numpy as np
import torch
import torch.nn as nn


+10 -52 modelscope/models/audio/ans/frcrn.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Dict

@@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT
from .unet import UNet


class FTB(nn.Module):

def __init__(self, input_dim=257, in_channel=9, r_channel=5):

super(FTB, self).__init__()
self.in_channel = in_channel
self.conv1 = nn.Sequential(
nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
nn.BatchNorm2d(r_channel), nn.ReLU())

self.conv1d = nn.Sequential(
nn.Conv1d(
r_channel * input_dim, in_channel, kernel_size=9, padding=4),
nn.BatchNorm1d(in_channel), nn.ReLU())
self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)

self.conv2 = nn.Sequential(
nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
nn.BatchNorm2d(in_channel), nn.ReLU())

def forward(self, inputs):
'''
inputs should be [Batch, Ca, Dim, Time]
'''
# T-F attention
conv1_out = self.conv1(inputs)
B, C, D, T = conv1_out.size()
reshape1_out = torch.reshape(conv1_out, [B, C * D, T])
conv1d_out = self.conv1d(reshape1_out)
conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T])

# now is also [B,C,D,T]
att_out = conv1d_out * inputs

# tranpose to [B,C,T,D]
att_out = torch.transpose(att_out, 2, 3)
freqfc_out = self.freq_fc(att_out)
att_out = torch.transpose(freqfc_out, 2, 3)

cat_out = torch.cat([att_out, inputs], 1)
outputs = self.conv2(cat_out)
return outputs


@MODELS.register_module(
Tasks.acoustic_noise_suppression,
module_name=Models.speech_frcrn_ans_cirm_16k)
class FRCRNModel(TorchModel):
class FRCRNDecorator(TorchModel):
r""" A decorator of FRCRN for integrating into modelscope framework """

def __init__(self, model_dir: str, *args, **kwargs):
@@ -78,13 +35,14 @@ class FRCRNModel(TorchModel):
checkpoint = torch.load(
model_bin_file, map_location=torch.device('cpu'))
if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
self.model.load_state_dict(
checkpoint['state_dict'], strict=False)
# the new trained model by user is based on FRCRNDecorator
self.load_state_dict(checkpoint['state_dict'])
else:
# The released model on Modelscope is based on FRCRN
self.model.load_state_dict(checkpoint, strict=False)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
result_list = self.model.forward(input['noisy'])
def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
result_list = self.model.forward(inputs['noisy'])
output = {
'spec_l1': result_list[0],
'wav_l1': result_list[1],
@@ -93,12 +51,12 @@ class FRCRNModel(TorchModel):
'wav_l2': result_list[4],
'mask_l2': result_list[5]
}
if 'clean' in input:
if 'clean' in inputs:
mix_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='Mix')
inputs['noisy'], inputs['clean'], result_list, mode='Mix')
output.update(mix_result)
sisnr_result = self.model.loss(
input['noisy'], input['clean'], result_list, mode='SiSNR')
inputs['noisy'], inputs['clean'], result_list, mode='SiSNR')
output.update(sisnr_result)
# logger hooker will use items under 'log_vars'
output['log_vars'] = {k: mix_result[k].item() for k in mix_result}


+1 -0 modelscope/models/audio/ans/se_module_complex.py

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
from torch import nn



+5 -0 modelscope/models/audio/ans/unet.py

@@ -1,3 +1,8 @@
"""
The implementation here is modified based on
Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft )
and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch
"""
import torch
import torch.nn as nn



+4 -4 modelscope/models/cv/__init__.py

@@ -3,15 +3,15 @@
# yapf: disable
from . import (action_recognition, animal_recognition, body_2d_keypoints,
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_detection, face_generation,
image_classification, image_color_enhance, image_colorization,
image_denoise, image_instance_segmentation,
crowd_counting, face_2d_keypoints, face_detection,
face_generation, image_classification, image_color_enhance,
image_colorization, image_denoise, image_instance_segmentation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
movie_scene_segmentation, object_detection,
product_retrieval_embedding, realtime_object_detection,
salient_detection, super_resolution,
salient_detection, shop_segmentation, super_resolution,
video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable

+21 -0 modelscope/models/cv/action_detection/__init__.py

@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:

from .action_detection_onnx import ActionDetONNX

else:
_import_structure = {'action_detection_onnx': ['ActionDetONNX']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+177 -0 modelscope/models/cv/action_detection/action_detection_onnx.py

@@ -0,0 +1,177 @@
import os
import os.path as osp
import shutil
import subprocess

import cv2
import numpy as np
import onnxruntime as rt

from modelscope.models import Model
from modelscope.utils.constant import Devices
from modelscope.utils.device import verify_device


class ActionDetONNX(Model):

def __init__(self, model_dir, config, *args, **kwargs):
super().__init__(self, model_dir, *args, **kwargs)
model_file = osp.join(config['model_file'])
device_type, device_id = verify_device(self._device_name)
options = rt.SessionOptions()
options.intra_op_num_threads = 1
options.inter_op_num_threads = 1
if device_type == Devices.gpu:
sess = rt.InferenceSession(
model_file,
providers=['CUDAExecutionProvider'],
sess_options=options,
provider_options=[{
'device_id': device_id
}])
else:
sess = rt.InferenceSession(
model_file,
providers=['CPUExecutionProvider'],
sess_options=options)
self.input_name = sess.get_inputs()[0].name
self.sess = sess
self.num_stride = len(config['fpn_strides'])
self.score_thresh = np.asarray(
config['pre_nms_thresh'], dtype='float32').reshape((1, -1))
self.size_divisibility = config['size_divisibility']
self.nms_threshold = config['nms_thresh']
self.tmp_dir = config['tmp_dir']
self.temporal_stride = config['step']
self.input_data_type = config['input_type']
self.action_names = config['action_names']
self.video_length_limit = config['video_length_limit']

def resize_box(self, det, height, width, scale_h, scale_w):
bboxs = det[0]
bboxs[:, [0, 2]] *= scale_w
bboxs[:, [1, 3]] *= scale_h
bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1)
bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1)
result = {
'boxes': bboxs.round().astype('int32').tolist(),
'scores': det[1].tolist(),
'labels': [self.action_names[i] for i in det[2].tolist()]
}
return result

def parse_frames(self, frame_names):
imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names]
imgs = np.stack(imgs).astype(self.input_data_type).transpose(
(3, 0, 1, 2)) # c,t,h,w
imgs = imgs[None]
return imgs

def forward_img(self, imgs, h, w):
pred = self.sess.run(None, {
self.input_name: imgs,
'height': np.asarray(h),
'width': np.asarray(w)
})
dets = self.post_nms(
pred,
score_threshold=self.score_thresh,
nms_threshold=self.nms_threshold)
return dets

def forward_video(self, video_name, scale):
min_size, max_size = self._get_sizes(scale)

tmp_dir = osp.join(self.tmp_dir, osp.basename(video_name)[:-4])
if osp.exists(tmp_dir):
shutil.rmtree(tmp_dir)
os.makedirs(tmp_dir)
frame_rate = 2
cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \
f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg'

cmd = cmd.split(' ')
subprocess.call(cmd)

frame_names = [
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir))
if name.endswith('.jpg')
]
frame_names = [
frame_names[i:i + frame_rate * 2]
for i in range(0,
len(frame_names) - frame_rate * 2 + 1, frame_rate
* self.temporal_stride)
]
timestamp = list(
range(1,
len(frame_names) * self.temporal_stride,
self.temporal_stride))
batch_imgs = [self.parse_frames(names) for names in frame_names]

N, _, T, H, W = batch_imgs[0].shape
scale_min = min_size / min(H, W)
h, w = min(int(scale_min * H),
max_size), min(int(scale_min * W), max_size)
h = round(h / self.size_divisibility) * self.size_divisibility
w = round(w / self.size_divisibility) * self.size_divisibility
scale_h, scale_w = H / h, W / w

results = []
for imgs in batch_imgs:
det = self.forward_img(imgs, h, w)
det = self.resize_box(det[0], H, W, scale_h, scale_w)
results.append(det)
results = [{
'timestamp': t,
'actions': res
} for t, res in zip(timestamp, results)]
shutil.rmtree(tmp_dir)
return results

def forward(self, video_name):
return self.forward_video(video_name, scale=1)

def post_nms(self, pred, score_threshold, nms_threshold=0.3):
pred_bboxes, pred_scores = pred
N = len(pred_bboxes)
dets = []
for i in range(N):
bboxes, scores = pred_bboxes[i], pred_scores[i]
candidate_inds = scores > score_threshold
scores = scores[candidate_inds]
candidate_nonzeros = candidate_inds.nonzero()
bboxes = bboxes[candidate_nonzeros[0]]
labels = candidate_nonzeros[1]
keep = self._nms(bboxes, scores, labels, nms_threshold)
bbox = bboxes[keep]
score = scores[keep]
label = labels[keep]
dets.append((bbox, score, label))
return dets

def _nms(self, boxes, scores, idxs, nms_threshold):
if len(boxes) == 0:
return []
max_coordinate = boxes.max()
offsets = idxs * (max_coordinate + 1)
boxes_for_nms = boxes + offsets[:, None].astype('float32')
boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0]
boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1]
keep = cv2.dnn.NMSBoxes(
boxes_for_nms.tolist(),
scores.tolist(),
score_threshold=0,
nms_threshold=nms_threshold)
if len(keep.shape) == 2:
keep = np.squeeze(keep, 1)
return keep

def _get_sizes(self, scale):
if scale == 1:
min_size, max_size = 512, 896
elif scale == 2:
min_size, max_size = 768, 1280
else:
min_size, max_size = 1024, 1792
return min_size, max_size
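
A rough usage sketch for ActionDetONNX, assuming onnxruntime and ffmpeg are available; the config dict only mirrors the keys read in __init__ above, and every value in it is an illustrative guess rather than the model's shipped configuration.

# Hedged sketch: config keys follow __init__ above; values are placeholders.
cfg = {
    'model_file': '/path/to/model.onnx',
    'fpn_strides': [8, 16, 32],
    'pre_nms_thresh': [0.4, 0.4, 0.4],
    'size_divisibility': 32,
    'nms_thresh': 0.5,
    'tmp_dir': '/tmp/action_det',
    'step': 2,
    'input_type': 'float32',
    'action_names': ['action_a', 'action_b', 'action_c'],
    'video_length_limit': 30,
}
detector = ActionDetONNX('/path/to/model_dir', cfg)
results = detector.forward('data/test/videos/action_detection_test_video.mp4')
# results: [{'timestamp': t, 'actions': {'boxes': ..., 'scores': ..., 'labels': ...}}, ...]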

+20 -0 modelscope/models/cv/face_2d_keypoints/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .face_2d_keypoints_align import Face2DKeypoints

else:
_import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+16 -0 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.face.face_keypoint import FaceKeypoint

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):

def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
FaceKeypoint.__init__(self, *args, **kwargs)

+22 -0 modelscope/models/cv/face_detection/__init__.py

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .retinaface import RetinaFaceDetection

else:
_import_structure = {
'retinaface': ['RetinaFaceDetection'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+1 -0 modelscope/models/cv/face_detection/retinaface/__init__.py

@@ -0,0 +1 @@
from .detection import RetinaFaceDetection

+137 -0 modelscope/models/cv/face_detection/retinaface/detection.py

@@ -0,0 +1,137 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .models.retinaface import RetinaFace
from .utils import PriorBox, decode, decode_landm, py_cpu_nms


@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface)
class RetinaFaceDetection(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
torch.set_grad_enabled(False)
cudnn.benchmark = True
self.model_path = model_path
self.cfg = Config.from_file(
model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION))['models']
self.net = RetinaFace(cfg=self.cfg)
self.load_model()
self.device = device
self.net = self.net.to(self.device)

self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device)

def check_keys(self, pretrained_state_dict):
ckpt_keys = set(pretrained_state_dict.keys())
model_keys = set(self.net.state_dict().keys())
used_pretrained_keys = model_keys & ckpt_keys
assert len(
used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint'
return True

def remove_prefix(self, state_dict, prefix):
new_state_dict = dict()
for k, v in state_dict.items():
if k.startswith(prefix):
new_state_dict[k[len(prefix):]] = v
else:
new_state_dict[k] = v
return new_state_dict

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))
if 'state_dict' in pretrained_dict.keys():
pretrained_dict = self.remove_prefix(pretrained_dict['state_dict'],
'module.')
else:
pretrained_dict = self.remove_prefix(pretrained_dict, 'module.')
self.check_keys(pretrained_dict)
self.net.load_state_dict(pretrained_dict, strict=False)
self.net.eval()

def forward(self, input):
img_raw = input['img'].cpu().numpy()
img = np.float32(img_raw)

im_height, im_width = img.shape[:2]
ss = 1.0
# tricky
if max(im_height, im_width) > 1500:
ss = 1000.0 / max(im_height, im_width)
img = cv2.resize(img, (0, 0), fx=ss, fy=ss)
im_height, im_width = img.shape[:2]

scale = torch.Tensor(
[img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
img -= (104, 117, 123)
img = img.transpose(2, 0, 1)
img = torch.from_numpy(img).unsqueeze(0)
img = img.to(self.device)
scale = scale.to(self.device)

loc, conf, landms = self.net(img) # forward pass
del img

confidence_threshold = 0.9
nms_threshold = 0.4
top_k = 5000
keep_top_k = 750

priorbox = PriorBox(self.cfg, image_size=(im_height, im_width))
priors = priorbox.forward()
priors = priors.to(self.device)
prior_data = priors.data
boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance'])
boxes = boxes * scale
boxes = boxes.cpu().numpy()
scores = conf.squeeze(0).data.cpu().numpy()[:, 1]
landms = decode_landm(
landms.data.squeeze(0), prior_data, self.cfg['variance'])
scale1 = torch.Tensor([
im_width, im_height, im_width, im_height, im_width, im_height,
im_width, im_height, im_width, im_height
])
scale1 = scale1.to(self.device)
landms = landms * scale1
landms = landms.cpu().numpy()

# ignore low scores
inds = np.where(scores > confidence_threshold)[0]
boxes = boxes[inds]
landms = landms[inds]
scores = scores[inds]

# keep top-K before NMS
order = scores.argsort()[::-1][:top_k]
boxes = boxes[order]
landms = landms[order]
scores = scores[order]

# do NMS
dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
np.float32, copy=False)
keep = py_cpu_nms(dets, nms_threshold)
dets = dets[keep, :]
landms = landms[keep]

# keep top-K faster NMS
dets = dets[:keep_top_k, :]
landms = landms[:keep_top_k, :]

landms = landms.reshape((-1, 5, 2))
landms = landms.reshape(
-1,
10,
)
return dets / ss, landms / ss

+0 -0 modelscope/models/cv/face_detection/retinaface/models/__init__.py


+149 -0 modelscope/models/cv/face_detection/retinaface/models/net.py

@@ -0,0 +1,149 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.models._utils as _utils
from torch.autograd import Variable


def conv_bn(inp, oup, stride=1, leaky=0):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup),
nn.LeakyReLU(negative_slope=leaky, inplace=True))


def conv_bn_no_relu(inp, oup, stride):
return nn.Sequential(
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
nn.BatchNorm2d(oup),
)


def conv_bn1X1(inp, oup, stride, leaky=0):
return nn.Sequential(
nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False),
nn.BatchNorm2d(oup), nn.LeakyReLU(negative_slope=leaky, inplace=True))


def conv_dw(inp, oup, stride, leaky=0.1):
return nn.Sequential(
nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
nn.BatchNorm2d(inp),
nn.LeakyReLU(negative_slope=leaky, inplace=True),
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
nn.BatchNorm2d(oup),
nn.LeakyReLU(negative_slope=leaky, inplace=True),
)


class SSH(nn.Module):

def __init__(self, in_channel, out_channel):
super(SSH, self).__init__()
assert out_channel % 4 == 0
leaky = 0
if (out_channel <= 64):
leaky = 0.1
self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1)

self.conv5X5_1 = conv_bn(
in_channel, out_channel // 4, stride=1, leaky=leaky)
self.conv5X5_2 = conv_bn_no_relu(
out_channel // 4, out_channel // 4, stride=1)

self.conv7X7_2 = conv_bn(
out_channel // 4, out_channel // 4, stride=1, leaky=leaky)
self.conv7x7_3 = conv_bn_no_relu(
out_channel // 4, out_channel // 4, stride=1)

def forward(self, input):
conv3X3 = self.conv3X3(input)

conv5X5_1 = self.conv5X5_1(input)
conv5X5 = self.conv5X5_2(conv5X5_1)

conv7X7_2 = self.conv7X7_2(conv5X5_1)
conv7X7 = self.conv7x7_3(conv7X7_2)

out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1)
out = F.relu(out)
return out


class FPN(nn.Module):

def __init__(self, in_channels_list, out_channels):
super(FPN, self).__init__()
leaky = 0
if (out_channels <= 64):
leaky = 0.1
self.output1 = conv_bn1X1(
in_channels_list[0], out_channels, stride=1, leaky=leaky)
self.output2 = conv_bn1X1(
in_channels_list[1], out_channels, stride=1, leaky=leaky)
self.output3 = conv_bn1X1(
in_channels_list[2], out_channels, stride=1, leaky=leaky)

self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky)
self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky)

def forward(self, input):
# names = list(input.keys())
input = list(input.values())

output1 = self.output1(input[0])
output2 = self.output2(input[1])
output3 = self.output3(input[2])

up3 = F.interpolate(
output3, size=[output2.size(2), output2.size(3)], mode='nearest')
output2 = output2 + up3
output2 = self.merge2(output2)

up2 = F.interpolate(
output2, size=[output1.size(2), output1.size(3)], mode='nearest')
output1 = output1 + up2
output1 = self.merge1(output1)

out = [output1, output2, output3]
return out


class MobileNetV1(nn.Module):

def __init__(self):
super(MobileNetV1, self).__init__()
self.stage1 = nn.Sequential(
conv_bn(3, 8, 2, leaky=0.1), # 3
conv_dw(8, 16, 1), # 7
conv_dw(16, 32, 2), # 11
conv_dw(32, 32, 1), # 19
conv_dw(32, 64, 2), # 27
conv_dw(64, 64, 1), # 43
)
self.stage2 = nn.Sequential(
conv_dw(64, 128, 2), # 43 + 16 = 59
conv_dw(128, 128, 1), # 59 + 32 = 91
conv_dw(128, 128, 1), # 91 + 32 = 123
conv_dw(128, 128, 1), # 123 + 32 = 155
conv_dw(128, 128, 1), # 155 + 32 = 187
conv_dw(128, 128, 1), # 187 + 32 = 219
)
self.stage3 = nn.Sequential(
conv_dw(128, 256, 2), # 219 +3 2 = 241
conv_dw(256, 256, 1), # 241 + 64 = 301
)
self.avg = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(256, 1000)

def forward(self, x):
x = self.stage1(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.avg(x)
x = x.view(-1, 256)
x = self.fc(x)
return x

+145 -0 modelscope/models/cv/face_detection/retinaface/models/retinaface.py

@@ -0,0 +1,145 @@
# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.models._utils as _utils
import torchvision.models.detection.backbone_utils as backbone_utils

from .net import FPN, SSH, MobileNetV1


class ClassHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(ClassHead, self).__init__()
self.num_anchors = num_anchors
self.conv1x1 = nn.Conv2d(
inchannels,
self.num_anchors * 2,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 2)


class BboxHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(BboxHead, self).__init__()
self.conv1x1 = nn.Conv2d(
inchannels,
num_anchors * 4,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 4)


class LandmarkHead(nn.Module):

def __init__(self, inchannels=512, num_anchors=3):
super(LandmarkHead, self).__init__()
self.conv1x1 = nn.Conv2d(
inchannels,
num_anchors * 10,
kernel_size=(1, 1),
stride=1,
padding=0)

def forward(self, x):
out = self.conv1x1(x)
out = out.permute(0, 2, 3, 1).contiguous()

return out.view(out.shape[0], -1, 10)


class RetinaFace(nn.Module):

def __init__(self, cfg=None):
"""
:param cfg: Network related settings.
"""
super(RetinaFace, self).__init__()
backbone = None
if cfg['name'] == 'Resnet50':
backbone = models.resnet50(pretrained=cfg['pretrain'])
else:
raise Exception('Invalid name')

self.body = _utils.IntermediateLayerGetter(backbone,
cfg['return_layers'])
in_channels_stage2 = cfg['in_channel']
in_channels_list = [
in_channels_stage2 * 2,
in_channels_stage2 * 4,
in_channels_stage2 * 8,
]
out_channels = cfg['out_channel']
self.fpn = FPN(in_channels_list, out_channels)
self.ssh1 = SSH(out_channels, out_channels)
self.ssh2 = SSH(out_channels, out_channels)
self.ssh3 = SSH(out_channels, out_channels)

self.ClassHead = self._make_class_head(
fpn_num=3, inchannels=cfg['out_channel'])
self.BboxHead = self._make_bbox_head(
fpn_num=3, inchannels=cfg['out_channel'])
self.LandmarkHead = self._make_landmark_head(
fpn_num=3, inchannels=cfg['out_channel'])

def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2):
classhead = nn.ModuleList()
for i in range(fpn_num):
classhead.append(ClassHead(inchannels, anchor_num))
return classhead

def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2):
bboxhead = nn.ModuleList()
for i in range(fpn_num):
bboxhead.append(BboxHead(inchannels, anchor_num))
return bboxhead

def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2):
landmarkhead = nn.ModuleList()
for i in range(fpn_num):
landmarkhead.append(LandmarkHead(inchannels, anchor_num))
return landmarkhead

def forward(self, inputs):
out = self.body(inputs)

# FPN
fpn = self.fpn(out)

# SSH
feature1 = self.ssh1(fpn[0])
feature2 = self.ssh2(fpn[1])
feature3 = self.ssh3(fpn[2])
features = [feature1, feature2, feature3]

bbox_regressions = torch.cat(
[self.BboxHead[i](feature) for i, feature in enumerate(features)],
dim=1)
classifications = torch.cat(
[self.ClassHead[i](feature) for i, feature in enumerate(features)],
dim=1)
ldm_regressions = torch.cat(
[self.LandmarkHead[i](feat) for i, feat in enumerate(features)],
dim=1)

output = (bbox_regressions, F.softmax(classifications,
dim=-1), ldm_regressions)
return output

+123 -0 modelscope/models/cv/face_detection/retinaface/utils.py

@@ -0,0 +1,123 @@
# --------------------------------------------------------
# Modified from https://github.com/biubug6/Pytorch_Retinaface
# --------------------------------------------------------

from itertools import product as product
from math import ceil

import numpy as np
import torch


class PriorBox(object):

def __init__(self, cfg, image_size=None, phase='train'):
super(PriorBox, self).__init__()
self.min_sizes = cfg['min_sizes']
self.steps = cfg['steps']
self.clip = cfg['clip']
self.image_size = image_size
self.feature_maps = [[
ceil(self.image_size[0] / step),
ceil(self.image_size[1] / step)
] for step in self.steps]
self.name = 's'

def forward(self):
anchors = []
for k, f in enumerate(self.feature_maps):
min_sizes = self.min_sizes[k]
for i, j in product(range(f[0]), range(f[1])):
for min_size in min_sizes:
s_kx = min_size / self.image_size[1]
s_ky = min_size / self.image_size[0]
dense_cx = [
x * self.steps[k] / self.image_size[1]
for x in [j + 0.5]
]
dense_cy = [
y * self.steps[k] / self.image_size[0]
for y in [i + 0.5]
]
for cy, cx in product(dense_cy, dense_cx):
anchors += [cx, cy, s_kx, s_ky]

# back to torch land
output = torch.Tensor(anchors).view(-1, 4)
if self.clip:
output.clamp_(max=1, min=0)
return output


def py_cpu_nms(dets, thresh):
"""Pure Python NMS baseline."""
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]

keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])

w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)

inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]

return keep


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
"""Decode locations from predictions using priors to undo
the encoding we did for offset regression at train time.
Args:
loc (tensor): location predictions for loc layers,
Shape: [num_priors,4]
priors (tensor): Prior boxes in center-offset form.
Shape: [num_priors,4].
variances: (list[float]) Variances of priorboxes
Return:
decoded bounding box predictions
"""

boxes = torch.cat(
(priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
boxes[:, :2] -= boxes[:, 2:] / 2
boxes[:, 2:] += boxes[:, :2]
return boxes


def decode_landm(pre, priors, variances):
"""Decode landm from predictions using priors to undo
the encoding we did for offset regression at train time.
Args:
pre (tensor): landm predictions for loc layers,
Shape: [num_priors,10]
priors (tensor): Prior boxes in center-offset form.
Shape: [num_priors,4].
variances: (list[float]) Variances of priorboxes
Return:
decoded landm predictions
"""
a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:]
b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:]
c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:]
d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:]
e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:]
landms = torch.cat((a, b, c, d, e), dim=1)
return landms
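
A minimal sketch of the prior-box / decode flow above on dummy tensors; the cfg values (min_sizes, steps, clip, variance) are illustrative placeholders, not the values from this commit's configuration file.

# Hedged sketch: with zero offsets, decode() returns the priors themselves in corner form.
import torch

cfg = {
    'min_sizes': [[16, 32], [64, 128], [256, 512]],
    'steps': [8, 16, 32],
    'clip': False,
    'variance': [0.1, 0.2],
}
priors = PriorBox(cfg, image_size=(640, 640)).forward()  # [num_priors, 4], (cx, cy, w, h)
loc = torch.zeros_like(priors)                           # dummy regression output
boxes = decode(loc, priors, cfg['variance'])             # (x1, y1, x2, y2), normalized coords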

+20 -0 modelscope/models/cv/facial_expression_recognition/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .fer import FacialExpressionRecognition

else:
_import_structure = {'fer': ['FacialExpressionRecognition']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+2 -0 modelscope/models/cv/facial_expression_recognition/fer/__init__.py

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .facial_expression_recognition import FacialExpressionRecognition

+72 -0 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py

@@ -0,0 +1,72 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import os

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from PIL import Image
from torch.autograd import Variable

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from . import transforms
from .vgg import VGG


@MODELS.register_module(
Tasks.facial_expression_recognition, module_name=Models.fer)
class FacialExpressionRecognition(TorchModel):

def __init__(self, model_path, device='cuda'):
super().__init__(model_path)
torch.set_grad_enabled(False)
cudnn.benchmark = True
self.model_path = model_path
self.device = device
self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE,
ModelFile.CONFIGURATION)
self.net = VGG('VGG19', cfg_path=self.cfg_path)
self.load_model()
self.net = self.net.to(device)
self.transform_test = transforms.Compose([
transforms.TenCrop(44),
transforms.Lambda(lambda crops: torch.stack(
[transforms.ToTensor()(crop) for crop in crops])),
])

self.mean = np.array([[104, 117, 123]])

def load_model(self, load_to_cpu=False):
pretrained_dict = torch.load(
self.model_path, map_location=torch.device('cpu'))
self.net.load_state_dict(pretrained_dict['net'], strict=True)
self.net.eval()

def forward(self, input):
img = input['img']
img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY)
img = cv2.resize(img, (48, 48))
img = img[:, :, np.newaxis]
img = np.concatenate((img, img, img), axis=2)

img = Image.fromarray(np.uint8(img))
inputs = self.transform_test(img)

ncrops, c, h, w = inputs.shape

inputs = inputs.view(-1, c, h, w)
inputs = inputs.to(self.device)
inputs = Variable(inputs, volatile=True)
outputs = self.net(inputs)

outputs_avg = outputs.view(ncrops, -1).mean(0) # avg over crops

score = F.softmax(outputs_avg)
_, predicted = torch.max(outputs_avg.data, 0)

return score, predicted

+118 -0 modelscope/models/cv/facial_expression_recognition/fer/transforms.py

@@ -0,0 +1,118 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import numbers
import types

import numpy as np
import torch
from PIL import Image


def to_tensor(pic):

# handle PIL Image
if pic.mode == 'I':
img = torch.from_numpy(np.array(pic, np.int32, copy=False))
elif pic.mode == 'I;16':
img = torch.from_numpy(np.array(pic, np.int16, copy=False))
else:
img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
# PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
if pic.mode == 'YCbCr':
nchannel = 3
elif pic.mode == 'I;16':
nchannel = 1
else:
nchannel = len(pic.mode)
img = img.view(pic.size[1], pic.size[0], nchannel)
# put it from HWC to CHW format
# yikes, this transpose takes 80% of the loading time/CPU
img = img.transpose(0, 1).transpose(0, 2).contiguous()
if isinstance(img, torch.ByteTensor):
return img.float().div(255)
else:
return img


def center_crop(img, output_size):
if isinstance(output_size, numbers.Number):
output_size = (int(output_size), int(output_size))
w, h = img.size
th, tw = output_size
i = int(round((h - th) / 2.))
j = int(round((w - tw) / 2.))
return img.crop((j, i, j + tw, i + th))


def five_crop(img, size):
if isinstance(size, numbers.Number):
size = (int(size), int(size))
else:
assert len(
size) == 2, 'Please provide only two dimensions (h, w) for size.'

w, h = img.size
crop_h, crop_w = size
if crop_w > w or crop_h > h:
raise ValueError(
'Requested crop size {} is bigger than input size {}'.format(
size, (h, w)))
tl = img.crop((0, 0, crop_w, crop_h))
tr = img.crop((w - crop_w, 0, w, crop_h))
bl = img.crop((0, h - crop_h, crop_w, h))
br = img.crop((w - crop_w, h - crop_h, w, h))
center = center_crop(img, (crop_h, crop_w))
return (tl, tr, bl, br, center)


class TenCrop(object):

def __init__(self, size, vertical_flip=False):
self.size = size
if isinstance(size, numbers.Number):
self.size = (int(size), int(size))
else:
assert len(
size
) == 2, 'Please provide only two dimensions (h, w) for size.'
self.size = size
self.vertical_flip = vertical_flip

def __call__(self, img):
first_five = five_crop(img, self.size)

if self.vertical_flip:
img = img.transpose(Image.FLIP_TOP_BOTTOM)
else:
img = img.transpose(Image.FLIP_LEFT_RIGHT)

second_five = five_crop(img, self.size)

return first_five + second_five


class Compose(object):

def __init__(self, transforms):
self.transforms = transforms

def __call__(self, img):
for t in self.transforms:
img = t(img)
return img


class ToTensor(object):

def __call__(self, pic):
return to_tensor(pic)


class Lambda(object):

def __init__(self, lambd):
assert isinstance(lambd, types.LambdaType)
self.lambd = lambd

def __call__(self, img):
return self.lambd(img)

+40 -0 modelscope/models/cv/facial_expression_recognition/fer/vgg.py

@@ -0,0 +1,40 @@
# The implementation is based on Facial-Expression-Recognition, available at
# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from modelscope.utils.config import Config


class VGG(nn.Module):

def __init__(self, vgg_name, cfg_path):
super(VGG, self).__init__()
model_cfg = Config.from_file(cfg_path)['models']
self.features = self._make_layers(model_cfg[vgg_name])
self.classifier = nn.Linear(512, 7)

def forward(self, x):
out = self.features(x)
out = out.view(out.size(0), -1)
out = F.dropout(out, p=0.5, training=self.training)
out = self.classifier(out)
return out

def _make_layers(self, cfg):
layers = []
in_channels = 3
for x in cfg:
if x == 'M':
layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
else:
layers += [
nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
nn.BatchNorm2d(x),
nn.ReLU(inplace=True)
]
in_channels = x
layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
return nn.Sequential(*layers)

+20 -0 modelscope/models/cv/shop_segmentation/__init__.py

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .shop_seg_base import SHOPSEG

else:
_import_structure = {'shop_seg_base': ['SHOPSEG']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+59 -0 modelscope/models/cv/shop_segmentation/common.py

@@ -0,0 +1,59 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import warnings

import torch.nn as nn
import torch.nn.functional as F


def resize(input,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None,
warning=True):
if warning:
if size is not None and align_corners:
input_h, input_w = tuple(int(x) for x in input.shape[2:])
output_h, output_w = tuple(int(x) for x in size)
if output_h > input_h or output_w > input_w:
if ((output_h > 1 and output_w > 1 and input_h > 1
and input_w > 1) and (output_h - 1) % (input_h - 1)
and (output_w - 1) % (input_w - 1)):
warnings.warn(
f'When align_corners={align_corners}, '
'the output would more aligned if '
f'input size {(input_h, input_w)} is `x+1` and '
f'out size {(output_h, output_w)} is `nx+1`')
return F.interpolate(input, size, scale_factor, mode, align_corners)


class Upsample(nn.Module):

def __init__(self,
size=None,
scale_factor=None,
mode='nearest',
align_corners=None):
super(Upsample, self).__init__()
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
self.mode = mode
self.align_corners = align_corners

def forward(self, x):
if not self.size:
size = [int(t * self.scale_factor) for t in x.shape[-2:]]
else:
size = self.size
return resize(x, size, None, self.mode, self.align_corners)
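
A tiny sketch of the Upsample wrapper above on a dummy feature map; the shapes are arbitrary.

# Hedged sketch: Upsample defers to resize()/F.interpolate at forward time.
import torch

feat = torch.randn(1, 256, 32, 32)
up = Upsample(scale_factor=2, mode='bilinear', align_corners=False)
print(up(feat).shape)  # torch.Size([1, 256, 64, 64])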

+122 -0 modelscope/models/cv/shop_segmentation/head_fpn.py

@@ -0,0 +1,122 @@
""" FPNHead
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from timm.models.layers import drop, drop_path, trunc_normal_

from .common import Upsample, resize


class FPNHead(nn.Module):
"""Panoptic Feature Pyramid Networks.
This head is the implementation of `Semantic FPN
<https://arxiv.org/abs/1901.02446>`_.
Args:
feature_strides (tuple[int]): The strides for input feature maps.
stack_lateral. All strides suppose to be power of 2. The first
one is of largest resolution.
"""

def __init__(self,
channels,
num_classes,
dropout_ratio=0.1,
feature_strides=[4, 8, 16, 32],
align_corners=False,
**kwargs):
super(FPNHead, self).__init__()
self.act_cfg = dict(type='ReLU')
self.channels = channels
self.conv_cfg = None
self.norm_cfg = None
self.norm_cfg = dict(type='BN2d', requires_grad=True)
self.align_corners = align_corners
self.dropout_ratio = dropout_ratio
self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
if dropout_ratio > 0:
self.dropout = nn.Dropout2d(dropout_ratio)
else:
self.dropout = None
self.in_index = [0, 1, 2, 3]
assert min(feature_strides) == feature_strides[0]
self.feature_strides = feature_strides
self.scale_heads = nn.ModuleList()
for i in range(len(feature_strides)):
head_length = max(
1,
int(np.log2(feature_strides[i]) - np.log2(feature_strides[0])))
scale_head = []
for k in range(head_length):
scale_head.append(
ConvModule(
self.channels,
self.channels,
3,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
if feature_strides[i] != feature_strides[0]:
scale_head.append(
Upsample(
scale_factor=2,
mode='bilinear',
align_corners=self.align_corners))
self.scale_heads.append(nn.Sequential(*scale_head))

self.apply(self._init_weights)

def _transform_inputs(self, inputs):
"""Transform inputs for decoder.

Args:
inputs (list[Tensor]): List of multi-level img features.

Returns:
Tensor: The transformed inputs
"""
inputs = [inputs[i] for i in self.in_index]
return inputs

def cls_seg(self, feat):
"""Classify each pixel."""
if self.dropout is not None:
feat = self.dropout(feat)
output = self.conv_seg(feat)
return output

def forward(self, inputs):
x = self._transform_inputs(inputs)
output = self.scale_heads[0](x[0])
for i in range(1, len(self.feature_strides)):
# non inplace
output = output + resize(
self.scale_heads[i](x[i]),
size=output.shape[2:],
mode='bilinear',
align_corners=self.align_corners)

output = self.cls_seg(output)
return output

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias.data, 0)
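
A hedged shape check for FPNHead, assuming four feature maps at strides 4/8/16/32 that already carry the head's channel count (mmcv must be installed for ConvModule):

import torch

head = FPNHead(channels=256, num_classes=2)
feats = [torch.randn(1, 256, s, s) for s in (128, 64, 32, 16)]  # strides 4, 8, 16, 32 of a 512x512 input
print(head(feats).shape)  # torch.Size([1, 2, 128, 128]); logits at the stride-4 resolution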

+ 901
- 0
modelscope/models/cv/shop_segmentation/models.py View File

@@ -0,0 +1,901 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import math
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop, drop_path, trunc_normal_
from torch import nn


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads
self.embed_dim = embed_dim
self.spacial_dim = spacial_dim

def forward(self, x):
B, C, H, W = x.shape
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC

cls_pos = self.positional_embedding[0:1, :]
spatial_pos = F.interpolate(
self.positional_embedding[1:, ].reshape(1, self.spacial_dim,
self.spacial_dim,
self.embed_dim).permute(
0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0)
positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)

x = x + positional_embedding[:, None, :]
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

x = x.permute(1, 2, 0)
global_feat = x[:, :, 0]
feature_map = x[:, :, 1:].reshape(B, -1, H, W)
return global_feat, feature_map


class CLIPResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim=512,
input_resolution=224,
width=64,
pretrained=None,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in CLIPResNet')

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)

outs = []
x = self.layer1(x)
outs.append(x)
x = self.layer2(x)
outs.append(x)
x = self.layer3(x)
outs.append(x)
x = self.layer4(x)
outs.append(x)

return tuple(outs)


class CLIPResNetWithAttention(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim=1024,
input_resolution=224,
width=64,
pretrained=None,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32,
output_dim)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

if 'positional_embedding' in new_k:
if self.attnpool.positional_embedding.shape != state_dict[
new_k].shape:
print(
f'Resize the pos_embed shape from {state_dict[new_k].shape}'
f' to {self.attnpool.positional_embedding.shape}'
)
cls_pos = state_dict[new_k][0:1, :]
H = W = self.input_resolution // 32
old_h = int(
math.sqrt(state_dict[new_k][1:, ].shape[0]))
spatial_pos = F.interpolate(
state_dict[new_k][1:, ].reshape(
1, old_h, old_h,
cls_pos.shape[1]).permute(0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(
cls_pos.shape[1], H * W).permute(1, 0)
positional_embedding = torch.cat(
[cls_pos, spatial_pos], dim=0)
state_dict[new_k] = positional_embedding
assert self.attnpool.positional_embedding.shape == state_dict[
new_k].shape

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in CLIPResNet')

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)

outs = []
x = self.layer1(x)
outs.append(x)
x = self.layer2(x)
outs.append(x)
x = self.layer3(x)
outs.append(x)
x = self.layer4(x)
outs.append(x)

x_global, x_local = self.attnpool(x)
outs.append([x_global, x_local])

return tuple(outs)


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)

def extra_repr(self) -> str:
return 'p={}'.format(self.drop_prob)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None,
drop_path=0.):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.drop_path(self.attention(self.ln_1(x)))
x = x + self.drop_path(self.mlp(self.ln_2(x)))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
drop_path_rate=0.):
super().__init__()
self.width = width
self.layers = layers
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)
] # stochastic depth decay rule
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask, dpr[i])
for i in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE: the scale factor was wrong in an earlier version; qk_scale can be set manually to stay compatible with previous weights
self.scale = qk_scale or head_dim**-0.5

self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, q, k, v):
B, N, C = q.shape
assert k.shape == v.shape
B, M, C = k.shape
q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)

attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale

attn = attn.softmax(dim=-1)

x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)

x = self.proj(x)
x = self.proj_drop(x)
return x


class TransformerDecoderLayer(nn.Module):

def __init__(
self,
d_model,
nhead,
dropout=0.1,
):
super().__init__()
self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)

self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)

self.mlp = nn.Sequential(
nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout),
nn.Linear(d_model * 4, d_model))

def forward(self, x, mem):
q = k = v = self.norm1(x)
x = x + self.self_attn(q, k, v)
q = self.norm2(x)
x = x + self.cross_attn(q, mem, mem)
x = x + self.dropout(self.mlp(self.norm3(x)))
return x


class CLIPVisionTransformer(nn.Module):

def __init__(self,
input_resolution=224,
patch_size=32,
width=768,
layers=12,
heads=12,
output_dim=512,
drop_path_rate=0.0,
out_indices=[3, 5, 7, 11],
pretrained=None,
get_embeddings=False,
**kwargs):
super().__init__()
self.pretrained = pretrained
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.spatial_size = input_resolution // patch_size
self.ln_pre = LayerNorm(width)
self.get_embeddings = get_embeddings

self.transformer = Transformer(
width, layers, heads, drop_path_rate=drop_path_rate)

self.out_indices = out_indices

if get_embeddings:
self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

embed_dim = width

if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.SyncBatchNorm(embed_dim),
nn.GELU(),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn2 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn3 = nn.GroupNorm(1, embed_dim)

self.fpn4 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=2, stride=2))

elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.ConvTranspose2d(
embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn2 = nn.GroupNorm(1, embed_dim)

self.fpn3 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=2, stride=2),
)

self.fpn4 = nn.Sequential(
nn.GroupNorm(1, embed_dim),
nn.MaxPool2d(kernel_size=4, stride=4),
)

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('visual.'):
new_k = k.replace('visual.', '')
state_dict[new_k] = checkpoint[k]

if 'positional_embedding' in state_dict.keys():
if self.positional_embedding.shape != state_dict[
'positional_embedding'].shape:
print(
f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to'
f' {self.positional_embedding.shape}')
cls_pos = state_dict['positional_embedding'][0:1, :]
spatial_pos = F.interpolate(
state_dict['positional_embedding'][1:, ].reshape(
1, 14, 14, 768).permute(0, 3, 1, 2),
size=(self.spatial_size, self.spatial_size),
mode='bilinear')
spatial_pos = spatial_pos.reshape(
768,
self.spatial_size * self.spatial_size).permute(1, 0)
positional_embedding = torch.cat([cls_pos, spatial_pos],
dim=0)
state_dict['positional_embedding'] = positional_embedding
assert self.positional_embedding.shape == state_dict[
'positional_embedding'].shape

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in vision transformer')

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
B, C, H, W = x.shape
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x1 = self.class_embedding.to(x.dtype)
x2 = torch.zeros(
x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([x1 + x2, x], dim=1)
pos = self.positional_embedding.to(x.dtype)
cls_pos = pos[0, :] + self.class_embedding.to(x.dtype)
spatial_pos = F.interpolate(
pos[1:, ].reshape(1, self.spatial_size, self.spatial_size,
C).permute(0, 3, 1, 2),
size=(H, W),
mode='bilinear')
spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1)
pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
x = x + pos
x = self.ln_pre(x)
x = x.permute(1, 0, 2) # NLD -> LND

gradientcheckpoint = False  # flip to True to trade compute for memory via torch.utils.checkpoint

features = []
for i, blk in enumerate(self.transformer.resblocks):
if gradientcheckpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)

if i in self.out_indices:
xp = x.permute(1, 0, 2)[:,
1:, :].permute(0, 2,
1).reshape(B, -1, H, W)
features.append(xp.contiguous())

ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
for i in range(len(features)):
features[i] = ops[i](features[i])

if self.get_embeddings:
x = x.permute(1, 0, 2)
x = self.ln_post(x)
x = x @ self.proj

global_embedding = x[:, 0]
visual_embedding = x[:, 1:].reshape(B, H, W,
-1).permute(0, 3, 1,
2) # B C H W

features.append([global_embedding, visual_embedding])

return tuple(features)


class CLIPTextEncoder(nn.Module):

def __init__(self,
context_length=77,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=1024,
out_dim=256,
pretrained=None,
**kwargs):
super().__init__()

self.pretrained = pretrained

self.context_length = context_length

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)
self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('transformer.'):
state_dict[k] = checkpoint[k]

if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
'token_embedding') or k.startswith('ln_final'):
if k == 'positional_embedding' and checkpoint[k].size(
0) > self.context_length:
checkpoint[k] = checkpoint[k][:self.context_length]
print('positional_embedding is truncated from 77 to',
self.context_length)
state_dict[k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in text encoder')

def build_attention_mask(self):
# lazily create the causal attention mask over the text tokens
# pytorch uses an additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1)  # zero the diagonal and lower triangle; -inf above blocks attention to future tokens
return mask

def forward(self, text):
x = self.token_embedding(text)
x = x + self.positional_embedding
x = x.permute(1, 0, 2)
x = self.transformer(x)
x = x.permute(1, 0, 2)
x = self.ln_final(x)
x = x[torch.arange(x.shape[0]),
text.argmax(dim=-1), ...] @ self.text_projection
return x


class CLIPTextContextEncoder(nn.Module):

def __init__(self,
context_length=22,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=1024,
out_dim=256,
pretrained=None,
**kwargs):
super().__init__()

self.pretrained = pretrained

self.context_length = context_length

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.embed_dim = embed_dim

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)
self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))

def init_weights(self, pretrained=None):
pretrained = pretrained or self.pretrained
if isinstance(pretrained, str):
checkpoint = torch.jit.load(
pretrained, map_location='cpu').float().state_dict()

state_dict = {}

for k in checkpoint.keys():
if k.startswith('transformer.'):
state_dict[k] = checkpoint[k]

if k == 'positional_embedding' or k == 'text_projection' or k.startswith(
'token_embedding') or k.startswith('ln_final'):
if k == 'positional_embedding' and checkpoint[k].size(
0) > self.context_length:
checkpoint[k] = checkpoint[k][:self.context_length]
print('positional_embedding is truncated from 77 to',
self.context_length)
state_dict[k] = checkpoint[k]

u, w = self.load_state_dict(state_dict, False)
print(u, w, 'are misaligned params in text encoder')

def build_attention_mask(self):
# lazily create the causal attention mask over the text tokens
# pytorch uses an additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1)  # zero the diagonal and lower triangle; -inf above blocks attention to future tokens
return mask

def forward(self, text, context=None):
x_text = self.token_embedding(text)  # (n_class, n_text, C)
K, N1, C = x_text.shape  # e.g. 150 classes * 5 tokens * 512
B, N2, C = context.shape  # e.g. 1 * 8 learnable context tokens * 512

eos_indx = text.argmax(dim=-1) + N2
eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1)

x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C)
context = context.reshape(B, 1, N2, C).expand(B, K, N2, C)

x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]],
dim=2).reshape(B * K, N1 + N2, C)
x = x + self.positional_embedding
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x)
x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection
x = x.reshape(B, K, self.embed_dim)
return x


class ContextDecoder(nn.Module):

def __init__(self,
transformer_width=256,
transformer_heads=4,
transformer_layers=6,
visual_dim=1024,
dropout=0.1,
**kwargs):
super().__init__()

self.memory_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.Linear(visual_dim, transformer_width),
nn.LayerNorm(transformer_width),
)

self.text_proj = nn.Sequential(
nn.LayerNorm(visual_dim),
nn.Linear(visual_dim, transformer_width),
)

self.decoder = nn.ModuleList([
TransformerDecoderLayer(transformer_width, transformer_heads,
dropout) for _ in range(transformer_layers)
])

self.out_proj = nn.Sequential(
nn.LayerNorm(transformer_width),
nn.Linear(transformer_width, visual_dim))

self.apply(self._init_weights)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)

def forward(self, text, visual):
B, N, C = visual.shape
visual = self.memory_proj(visual)
x = self.text_proj(text)

for layer in self.decoder:
x = layer(x, visual)

return self.out_proj(x)
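
A minimal shape check for AttentionPool2d with random weights; it illustrates how the mean-pooled class token and the interpolated positional embedding yield a global feature plus a dense feature map (values are meaningless without pretrained CLIP weights):

import torch

pool = AttentionPool2d(spacial_dim=7, embed_dim=64, num_heads=8, output_dim=32)
feat = torch.randn(2, 64, 14, 14)  # H, W may differ from spacial_dim; the pos-embed is interpolated
global_feat, feature_map = pool(feat)
print(global_feat.shape)   # torch.Size([2, 32])
print(feature_map.shape)   # torch.Size([2, 32, 14, 14])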

+ 217
- 0
modelscope/models/cv/shop_segmentation/neck_fpn.py View File

@@ -0,0 +1,217 @@
""" FPNneck
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from timm.models.layers import drop, drop_path, trunc_normal_

from .common import resize


class FPN(nn.Module):
"""Feature Pyramid Network.

This neck is the implementation of `Feature Pyramid Networks for Object
Detection <https://arxiv.org/abs/1612.03144>`_.

Args:
in_channels (list[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale).
num_outs (int): Number of output scales.
start_level (int): Index of the start input backbone level used to
build the feature pyramid. Default: 0.
end_level (int): Index of the end input backbone level (exclusive) to
build the feature pyramid. Default: -1, which means the last level.
add_extra_convs (bool | str): If bool, it decides whether to add conv
layers on top of the original feature maps. Default to False.
If True, its actual mode is specified by `extra_convs_on_inputs`.
If str, it specifies the source feature map of the extra convs.
Only the following options are allowed

- 'on_input': Last feat map of neck inputs (i.e. backbone feature).
- 'on_lateral': Last feature map after lateral convs.
- 'on_output': The last output feature map after fpn convs.
extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
on the original feature from the backbone. If True,
it is equivalent to `add_extra_convs='on_input'`. If False, it is
equivalent to set `add_extra_convs='on_output'`. Default to True.
relu_before_extra_convs (bool): Whether to apply relu before the extra
conv. Default: False.
no_norm_on_lateral (bool): Whether to apply norm on lateral.
Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (dict): Config dict for activation layer in ConvModule.
Default: None.
upsample_cfg (dict): Config dict for interpolate layer.
Default: dict(mode='nearest').
init_cfg (dict or list[dict], optional): Initialization config dict.

"""

def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
add_extra_convs=False,
extra_convs_on_inputs=False,
relu_before_extra_convs=False,
no_norm_on_lateral=False,
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
upsample_cfg=dict(mode='nearest')):
super(FPN, self).__init__()
assert isinstance(in_channels, list)
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.relu_before_extra_convs = relu_before_extra_convs
self.no_norm_on_lateral = no_norm_on_lateral
self.fp16_enabled = False
self.upsample_cfg = upsample_cfg.copy()

if end_level == -1:
self.backbone_end_level = self.num_ins
assert num_outs >= self.num_ins - start_level
else:
# if end_level < inputs, no extra level is allowed
self.backbone_end_level = end_level
assert end_level <= len(in_channels)
assert num_outs == end_level - start_level
self.start_level = start_level
self.end_level = end_level
self.add_extra_convs = add_extra_convs
assert isinstance(add_extra_convs, (str, bool))
if isinstance(add_extra_convs, str):
# Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
elif add_extra_convs: # True
if extra_convs_on_inputs:
# For compatibility with previous release
# TODO: deprecate `extra_convs_on_inputs`
self.add_extra_convs = 'on_input'
else:
self.add_extra_convs = 'on_output'

self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()

for i in range(self.start_level, self.backbone_end_level):
l_conv = ConvModule(
in_channels[i],
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
act_cfg=act_cfg,
inplace=False)
fpn_conv = ConvModule(
out_channels,
out_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)

self.lateral_convs.append(l_conv)
self.fpn_convs.append(fpn_conv)

# add extra conv layers (e.g., RetinaNet)
extra_levels = num_outs - self.backbone_end_level + self.start_level
if self.add_extra_convs and extra_levels >= 1:
for i in range(extra_levels):
if i == 0 and self.add_extra_convs == 'on_input':
in_channels = self.in_channels[self.backbone_end_level - 1]
else:
in_channels = out_channels
extra_fpn_conv = ConvModule(
in_channels,
out_channels,
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(extra_fpn_conv)

self.apply(self._init_weights)

def forward(self, inputs):
assert len(inputs) == len(self.in_channels)

# build laterals
laterals = [
lateral_conv(inputs[i + self.start_level])
for i, lateral_conv in enumerate(self.lateral_convs)
]

# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
# In some cases, fixing `scale factor` (e.g. 2) is preferred, but
# it cannot co-exist with `size` in `F.interpolate`.
if 'scale_factor' in self.upsample_cfg:
laterals[i - 1] = laterals[i - 1] + resize(
laterals[i], **self.upsample_cfg)
else:
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] = laterals[i - 1] + resize(
laterals[i], size=prev_shape, **self.upsample_cfg)

# build outputs
# part 1: from original levels
outs = [
self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
]
# part 2: add extra levels
if self.num_outs > len(outs):
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
if not self.add_extra_convs:
for i in range(self.num_outs - used_backbone_levels):
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
# add conv layers on top of original feature maps (RetinaNet)
else:
if self.add_extra_convs == 'on_input':
extra_source = inputs[self.backbone_end_level - 1]
elif self.add_extra_convs == 'on_lateral':
extra_source = laterals[-1]
elif self.add_extra_convs == 'on_output':
extra_source = outs[-1]
else:
raise NotImplementedError
outs.append(self.fpn_convs[used_backbone_levels](extra_source))
for i in range(used_backbone_levels + 1, self.num_outs):
if self.relu_before_extra_convs:
outs.append(self.fpn_convs[i](F.relu(outs[-1])))
else:
outs.append(self.fpn_convs[i](outs[-1]))
return tuple(outs)

def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias.data, 0)
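
A hedged shape check for the FPN neck with four dummy backbone levels (the channel counts and spatial sizes below are arbitrary; mmcv is required):

import torch

neck = FPN(in_channels=[96, 192, 384, 768], out_channels=256, num_outs=4)
feats = [torch.randn(1, c, s, s) for c, s in zip([96, 192, 384, 768], [64, 32, 16, 8])]
outs = neck(feats)
print([tuple(o.shape) for o in outs])
# [(1, 256, 64, 64), (1, 256, 32, 32), (1, 256, 16, 16), (1, 256, 8, 8)]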

+ 157
- 0
modelscope/models/cv/shop_segmentation/shop_seg_base.py View File

@@ -0,0 +1,157 @@
"""
Base modules are adapted from https://github.com/open-mmlab/mmcv/,
originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab,
https://github.com/open-mmlab/mmsegmentation/,
originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab,
and adapted from https://github.com/raoyongming/DenseCLIP/,
originally MIT License, Copyright (c) 2022 Rao, Yongming.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

from .head_fpn import FPNHead
from .models import (CLIPTextContextEncoder, CLIPVisionTransformer,
ContextDecoder)
from .neck_fpn import FPN
from .utils import SimpleTokenizer, tokenize


class SHOPSEG(nn.Module):
"""Encoder Decoder segmentors.

EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
Note that auxiliary_head is only used for deep supervision during training,
which can be dropped during inference.
"""

def __init__(self,
model_dir,
context_length=22,
context_feature='attention',
score_concat_index=2,
tau=0.07,
token_embed_dim=512,
text_dim=512,
**args):
super(SHOPSEG, self).__init__()

self.model_dir = model_dir
self.tokenizer = SimpleTokenizer(model_dir
+ '/bpe_simple_vocab_16e6.txt.gz')

backbone = CLIPVisionTransformer(
input_resolution=1024,
patch_size=16,
width=768,
layers=12,
output_dim=512,
drop_path_rate=0.1,
pretrained=False,
get_embeddings=True)

text_encoder = CLIPTextContextEncoder(
context_length=30,
vocab_size=49408,
transformer_width=512,
transformer_heads=8,
transformer_layers=12,
embed_dim=512,
pretrained=False)

context_decoder = ContextDecoder(
transformer_width=256,
transformer_heads=4,
transformer_layers=3,
visual_dim=512,
dropout=0.1)
neck = FPN(
in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4)
head_fpn = FPNHead(channels=256, num_classes=2)

self.backbone = backbone
self.text_encoder = text_encoder
self.context_decoder = context_decoder
self.context_length = context_length
self.score_concat_index = score_concat_index

self.context_feature = context_feature
self.tau = tau
context_length = self.text_encoder.context_length - self.context_length
self.contexts = nn.Parameter(
torch.randn(1, context_length, token_embed_dim))
nn.init.trunc_normal_(self.contexts)
self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4)

self.neck = neck
self.head_fpn = head_fpn

self.tau = 0.07

def encode_text(self, text, context_length):
output = tokenize(self.tokenizer, text, context_length, True)
return output

def extract_feat(self, img):
"""Extract features from images."""
x = self.backbone(img)
return x

def after_extract_feat(self, x, name_list):
x_orig = list(x[0:4])
global_feat, visual_embeddings = x[4]
B, C, H, W = visual_embeddings.shape
if self.context_feature == 'attention':
x1 = global_feat.reshape(B, C, 1)
x2 = visual_embeddings.reshape(B, C, H * W)
visual_context = torch.cat([x1, x2], dim=2).permute(0, 2, 1)
texts = torch.cat([
self.encode_text(c, context_length=self.context_length)
for c in name_list
])
x1 = texts.to(global_feat.device)
x1 = self.text_encoder(x1, self.contexts)
text_embeddings = x1.expand(B, -1, -1)
# update text_embeddings by visual_context!
# (B, 1, C)
text_diff = self.context_decoder(text_embeddings, visual_context)
# (B, K, C)
text_embeddings = text_embeddings + self.gamma * text_diff

# compute score map and concat
B, K, C = text_embeddings.shape
visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2)
text = F.normalize(text_embeddings, dim=2, p=2)
score_map_list = []
bsz = B
for i in range(bsz):
ind = 2 * i
sub_text = torch.cat(
[text[i:i + 1, ind:ind + 1], text[i:i + 1, ind + 1:ind + 2]],
dim=1)  # 1 * 2 * C

sub_score_map = torch.einsum('bchw,bkc->bkhw',
visual_embeddings[i:i + 1],
sub_text) # 1 * 2 * h * w
score_map_list.append(sub_score_map)
score_map = torch.cat(score_map_list, dim=0) # b * 2 * h * w
x_orig[self.score_concat_index] = torch.cat(
[x_orig[self.score_concat_index], score_map], dim=1)
return x_orig, score_map

def forward(self, img, text_list=None):
if text_list is None:
bsz = img.size()[0]
text_list = ['foreground'] * bsz
x = self.extract_feat(img)
_x_orig = [x[i] for i in range(4)]
name_list = []
for name in text_list:
name_list.append('others')
name_list.append(name[0:20])
x_orig, score_map = self.after_extract_feat(x, name_list)
x_orig = list(self.neck(x_orig))
_x_orig = x_orig
pred = self.head_fpn(_x_orig)
return pred
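
The score map built in after_extract_feat is a cosine-similarity map between the L2-normalized visual embedding and the two text embeddings ('others' vs. the query). A small standalone sketch with dummy tensors:

import torch
import torch.nn.functional as F

B, C, H, W = 1, 512, 64, 64
visual = F.normalize(torch.randn(B, C, H, W), dim=1, p=2)
text = F.normalize(torch.randn(B, 2, C), dim=2, p=2)  # embeddings for ['others', 'red dress']
score_map = torch.einsum('bchw,bkc->bkhw', visual, text)
print(score_map.shape)  # torch.Size([1, 2, 64, 64]); one 2-channel map per image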

+ 115
- 0
modelscope/models/cv/shop_segmentation/shop_seg_model.py View File

@@ -0,0 +1,115 @@
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.shop_segmentation import SHOPSEG
from modelscope.outputs import OutputKeys
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['ShopSegmentation']


@MODELS.register_module(
Tasks.shop_segmentation, module_name=Models.shop_segmentation)
class ShopSegmentation(TorchModel):
""" shop segmentation model.
"""

def __init__(self, model_dir, device_id=0, *args, **kwargs):
super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)

self.model = SHOPSEG(model_dir=model_dir)
pretrained_params = torch.load('{}/{}'.format(
model_dir, ModelFile.TORCH_MODEL_BIN_FILE))

self.model.load_state_dict(pretrained_params)
self.model.eval()
self.device_id = device_id
if self.device_id >= 0 and torch.cuda.is_available():
self.model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
self.device_id = -1
logger.info('Use CPU for inference')

def preprocess(self, img, size=1024):
mean = [0.48145466, 0.4578275, 0.40821073]
std = [0.26862954, 0.26130258, 0.27577711]
h, w, c = img.shape
max_hw = max(h, w)
ratio = 1.0 * size / max_hw
crop_h, crop_w = int(ratio * h), int(ratio * w)
pil_img = Image.fromarray(img)
pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
np_img = np.array(pil_img, dtype=np.float32) / 255.

for j in range(3):
np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]

img_pad = np.zeros((size, size, 3), dtype=np.float32)
img_pad[:crop_h, :crop_w] = np_img

img_pad = torch.from_numpy(img_pad).permute(2, 0,
1).unsqueeze(0).float()
return img_pad, h, w, crop_h, crop_w

def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
output = np.clip(tensors * 255., a_min=0, a_max=255.)
crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)

pil_output = Image.fromarray(crop_output)
pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
np_output = np.array(pil_output, dtype=np.uint8)

np_output[np_output < 128] = 0
np_output[np_output >= 128] = 255
np_output = np.uint8(np_output)
return np_output

def forward(self, image):
"""
image should be numpy array, dtype=np.uint8, shape: height*width*3
"""
image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
image, size=1024)
pred = self.inference(image_tensor)
msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)

outputs = {OutputKeys.MASKS: msk}
return outputs

def inference(self, image):
"""
image should be tensor, 1 * 3 * 1024 * 1024
"""
with torch.no_grad():
if self.device_id == -1:
output = self.model(image)
else:
device = torch.device('cuda', self.device_id)
output = self.model(image.to(device))
output = F.interpolate(output, size=(1024, 1024), mode='bilinear')
output = F.softmax(output, dim=1)
output = torch.argmax(output, dim=1)
output = output[0]
if self.device_id == -1:
pred = output.data.numpy()
else:
pred = output.data.cpu().numpy()

del output
return pred
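
A hedged end-to-end usage sketch; the model directory below is a placeholder and must contain the TorchModel weights plus the CLIP BPE vocab that SHOPSEG expects:

import numpy as np

model = ShopSegmentation(model_dir='/path/to/shop_segmentation_model', device_id=-1)  # -1 = CPU
image = np.zeros((600, 800, 3), dtype=np.uint8)  # H * W * 3, uint8
result = model.forward(image)
mask = result[OutputKeys.MASKS]  # uint8 mask with values in {0, 255}, shape (600, 800)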

+ 199
- 0
modelscope/models/cv/shop_segmentation/utils.py View File

@@ -0,0 +1,199 @@
""" CLIP Tokenizer
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import gzip
import html
import os
from functools import lru_cache
from typing import Any, List, Union

import ftfy
import regex as re
import torch


@lru_cache()
def default_bpe():
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
and we avoid mapping to whitespace/control characters that the bpe code barfs on.
"""
bs = list(range(ord('!'),
ord('~') + 1)) + list(range(
ord('¡'),
ord('¬') + 1)) + list(range(ord('®'),
ord('ÿ') + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs


def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text


class SimpleTokenizer(object):

def __init__(self, bpe_path: str = default_bpe()):
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
merges = merges[1:49152 - 256 - 2 + 1]
merges = [tuple(merge.split()) for merge in merges]
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v + '</w>' for v in vocab]
for merge in merges:
vocab.append(''.join(merge))
vocab.extend(['<|startoftext|>', '<|endoftext|>'])
self.encoder = dict(zip(vocab, range(len(vocab))))
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>'
}
self.pat = re.compile(
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
re.IGNORECASE)

def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + '</w>', )
pairs = get_pairs(word)

if not pairs:
return token + '</w>'

error_list = []
while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except Exception as err:
error_list.append(err)
new_word.extend(word[i:])
break

if word[i] == first and i < len(word) - 1 and word[
i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b]
for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode(
'utf-8', errors='replace').replace('</w>', ' ')
return text


def tokenize(tokenizer,
texts,
context_length: int = 77,
truncate: bool = False) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
context_length : int
The context length to use; all CLIP models use 77 as the context length
truncate: bool
Whether to truncate the text in case its encoding is longer than the context length
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

sot_token = tokenizer.encoder['<|startoftext|>']
eot_token = tokenizer.encoder['<|endoftext|>']
all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token]
for text in texts]
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
if len(tokens) > context_length:
if truncate:
tokens = tokens[:context_length]
tokens[-1] = eot_token
else:
raise RuntimeError(
f'Input {texts[i]} is too long for context length {context_length}'
)
result[i, :len(tokens)] = torch.tensor(tokens)

return result
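
A small sketch of the tokenizer; the BPE vocab path is a placeholder for the archive shipped with the model:

tok = SimpleTokenizer(bpe_path='/path/to/bpe_simple_vocab_16e6.txt.gz')
ids = tokenize(tok, ['a photo of a dress', 'others'], context_length=16, truncate=True)
print(ids.shape)  # torch.Size([2, 16]); each row is [SOT, BPE ids..., EOT, zero padding]
print(tok.decode(ids[0][ids[0] > 0].tolist()))  # decodes back to the prompt wrapped in SOT/EOT markers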

+ 1
- 0
modelscope/models/cv/text_driven_segmentation/__init__.py View File

@@ -0,0 +1 @@
from .lseg_base import TextDrivenSegmentation

+ 170
- 0
modelscope/models/cv/text_driven_segmentation/clip.py View File

@@ -0,0 +1,170 @@
""" CLIP
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import hashlib
import os
import urllib
import warnings
from typing import Any, List, Union

import torch
from PIL import Image
from pkg_resources import packaging
from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
ToTensor)
from tqdm import tqdm

from .model import build_model
from .simple_tokenizer import SimpleTokenizer as _Tokenizer

try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
BICUBIC = Image.BICUBIC

if packaging.version.parse(
torch.__version__) < packaging.version.parse('1.7.1'):
warnings.warn('PyTorch version 1.7.1 or higher is recommended')
__all__ = ['load', 'tokenize']


def _convert_image_to_rgb(image):
return image.convert('RGB')


def _transform(n_px):
return Compose([
Resize(n_px, interpolation=BICUBIC),
CenterCrop(n_px),
_convert_image_to_rgb,
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])


def load(name: str,
device: Union[str, torch.device] = 'cuda'
if torch.cuda.is_available() else 'cpu',
jit: bool = False,
root: str = None):

if not jit:
model = build_model().to(device)
if str(device) == 'cpu':
model.float()
return model, _transform(model.visual.input_resolution)

# patch the device names
device_holder = torch.jit.trace(
lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
device_node = [
n for n in device_holder.graph.findAllNodes('prim::Constant')
if 'Device' in repr(n)
][-1]

def patch_device(module):
try:
graphs = [module.graph] if hasattr(module, 'graph') else []
except RuntimeError:
graphs = []

if hasattr(module, 'forward1'):
graphs.append(module.forward1.graph)

for graph in graphs:
for node in graph.findAllNodes('prim::Constant'):
if 'value' in node.attributeNames() and str(
node['value']).startswith('cuda'):
node.copyAttributes(device_node)

model.apply(patch_device)
patch_device(model.encode_image)
patch_device(model.encode_text)

# patch dtype to float32 on CPU
if str(device) == 'cpu':
float_holder = torch.jit.trace(
lambda: torch.ones([]).float(), example_inputs=[])
float_input = list(float_holder.graph.findNode('aten::to').inputs())[1]
float_node = float_input.node()

def patch_float(module):
try:
graphs = [module.graph] if hasattr(module, 'graph') else []
except RuntimeError:
graphs = []

if hasattr(module, 'forward1'):
graphs.append(module.forward1.graph)

for graph in graphs:
for node in graph.findAllNodes('aten::to'):
inputs = list(node.inputs())
for i in [
1, 2
]: # dtype can be the second or third argument to aten::to()
if inputs[i].node()['value'] == 5:
inputs[i].node().copyAttributes(float_node)

model.apply(patch_float)
patch_float(model.encode_image)
patch_float(model.encode_text)

model.float()

return model, _transform(model.input_resolution.item())


def tokenize(
_tokenizer,
texts: Union[str, List[str]],
context_length: int = 77,
truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
"""
Returns the tokenized representation of given input string(s)

Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize

context_length : int
The context length to use; all CLIP models use 77 as the context length

truncate: bool
Whether to truncate the text in case its encoding is longer than the context length

Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
"""
if isinstance(texts, str):
texts = [texts]

sot_token = _tokenizer.encoder['<|startoftext|>']
eot_token = _tokenizer.encoder['<|endoftext|>']
all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
for text in texts]
if packaging.version.parse(
torch.__version__) < packaging.version.parse('1.8.0'):
result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
else:
result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

for i, tokens in enumerate(all_tokens):
if len(tokens) > context_length:
if truncate:
tokens = tokens[:context_length]
tokens[-1] = eot_token
else:
raise RuntimeError(
f'Input {texts[i]} is too long for context length {context_length}'
)
result[i, :len(tokens)] = torch.tensor(tokens)

return result
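
A small sketch of the preprocessing transform returned alongside the model: resize, center-crop and normalize to CLIP statistics (the gray image is just a stand-in):

from PIL import Image

preprocess = _transform(224)
tensor = preprocess(Image.new('RGB', (640, 480), color=(128, 128, 128)))
print(tensor.shape)  # torch.Size([3, 224, 224])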

+ 28
- 0
modelscope/models/cv/text_driven_segmentation/lseg_base.py View File

@@ -0,0 +1,28 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import torch
import torch.nn as nn

from .lseg_net import LSeg


class TextDrivenSegmentation(nn.Module):

def __init__(self, model_dir):
super(TextDrivenSegmentation, self).__init__()
self.net = LSeg(model_dir=model_dir)
self.model_dir = model_dir

def forward(self, img, txt_list):
b = img.size()[0]
batch_name_list = txt_list
xout_list = []
for i in range(b):
labelset = ['others', batch_name_list[i]]
xout = self.net(img[i:i + 1], labelset=labelset)
xout_list.append(xout)
score_map = torch.cat(xout_list, dim=0)
return score_map

+ 334
- 0
modelscope/models/cv/text_driven_segmentation/lseg_blocks.py View File

@@ -0,0 +1,334 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import torch
import torch.nn as nn

from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit


def _make_encoder(
backbone,
features,
use_pretrained=True,
groups=1,
expand=False,
exportable=True,
hooks=None,
use_vit_only=False,
use_readout='ignore',
enable_attention_hooks=False,
):
if backbone == 'clip_vitl16_384':
clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
use_pretrained,
hooks=hooks,
use_readout=use_readout,
enable_attention_hooks=enable_attention_hooks,
)
scratch = _make_scratch([256, 512, 1024, 1024],
features,
groups=groups,
expand=expand)
else:
raise NotImplementedError(f"Backbone '{backbone}' not implemented")

return clip_pretrained, pretrained, scratch


def _make_scratch(in_shape, out_shape, groups=1, expand=False):
scratch = nn.Module()

out_shape1 = out_shape
out_shape2 = out_shape
out_shape3 = out_shape
out_shape4 = out_shape
if expand is True:
out_shape1 = out_shape
out_shape2 = out_shape * 2
out_shape3 = out_shape * 4
out_shape4 = out_shape * 8

scratch.layer1_rn = nn.Conv2d(
in_shape[0],
out_shape1,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer2_rn = nn.Conv2d(
in_shape[1],
out_shape2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer3_rn = nn.Conv2d(
in_shape[2],
out_shape3,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)
scratch.layer4_rn = nn.Conv2d(
in_shape[3],
out_shape4,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups,
)

return scratch


class Interpolate(nn.Module):
"""Interpolation module."""

def __init__(self, scale_factor, mode, align_corners=False):
"""Init.

Args:
scale_factor (float): scaling
mode (str): interpolation mode
"""
super(Interpolate, self).__init__()

self.interp = nn.functional.interpolate
self.scale_factor = scale_factor
self.mode = mode
self.align_corners = align_corners

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: interpolated data
"""

x = self.interp(
x,
scale_factor=self.scale_factor,
mode=self.mode,
align_corners=self.align_corners,
)

return x


class ResidualConvUnit(nn.Module):
"""Residual convolution module."""

def __init__(self, features):
"""Init.

Args:
features (int): number of features
"""
super().__init__()

self.conv1 = nn.Conv2d(
features, features, kernel_size=3, stride=1, padding=1, bias=True)

self.conv2 = nn.Conv2d(
features, features, kernel_size=3, stride=1, padding=1, bias=True)

self.relu = nn.ReLU(inplace=True)

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: output
"""
out = self.relu(x)
out = self.conv1(out)
out = self.relu(out)
out = self.conv2(out)

return out + x


class FeatureFusionBlock(nn.Module):
"""Feature fusion block."""

def __init__(self, features):
"""Init.

Args:
features (int): number of features
"""
super(FeatureFusionBlock, self).__init__()

self.resConfUnit1 = ResidualConvUnit(features)
self.resConfUnit2 = ResidualConvUnit(features)

def forward(self, *xs):
"""Forward pass.

Returns:
tensor: output
"""
output = xs[0]

if len(xs) == 2:
output += self.resConfUnit1(xs[1])

output = self.resConfUnit2(output)

output = nn.functional.interpolate(
output, scale_factor=2, mode='bilinear', align_corners=True)

return output


class ResidualConvUnit_custom(nn.Module):
"""Residual convolution module."""

def __init__(self, features, activation, bn):
"""Init.

Args:
features (int): number of features
"""
super().__init__()

self.bn = bn

self.groups = 1

self.conv1 = nn.Conv2d(
features,
features,
kernel_size=3,
stride=1,
padding=1,
bias=not self.bn,
groups=self.groups,
)

self.conv2 = nn.Conv2d(
features,
features,
kernel_size=3,
stride=1,
padding=1,
bias=not self.bn,
groups=self.groups,
)

if self.bn is True:
self.bn1 = nn.BatchNorm2d(features)
self.bn2 = nn.BatchNorm2d(features)

self.activation = activation

self.skip_add = nn.quantized.FloatFunctional()

def forward(self, x):
"""Forward pass.

Args:
x (tensor): input

Returns:
tensor: output
"""

out = self.activation(x)
out = self.conv1(out)
if self.bn is True:
out = self.bn1(out)

out = self.activation(out)
out = self.conv2(out)
if self.bn is True:
out = self.bn2(out)

if self.groups > 1:
out = self.conv_merge(out)

return self.skip_add.add(out, x)


class FeatureFusionBlock_custom(nn.Module):
"""Feature fusion block."""

def __init__(
self,
features,
activation,
deconv=False,
bn=False,
expand=False,
align_corners=True,
):
"""Init.

Args:
features (int): number of features
"""
super(FeatureFusionBlock_custom, self).__init__()

self.deconv = deconv
self.align_corners = align_corners

self.groups = 1

self.expand = expand
out_features = features
if self.expand is True:
out_features = features // 2

self.out_conv = nn.Conv2d(
features,
out_features,
kernel_size=1,
stride=1,
padding=0,
bias=True,
groups=1,
)

self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)

self.skip_add = nn.quantized.FloatFunctional()

def forward(self, *xs):
"""Forward pass.

Returns:
tensor: output
"""
output = xs[0]

if len(xs) == 2:
res = self.resConfUnit1(xs[1])
output = self.skip_add.add(output, res)

output = self.resConfUnit2(output)

output = nn.functional.interpolate(
output,
scale_factor=2,
mode='bilinear',
align_corners=self.align_corners)

output = self.out_conv(output)
return output
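
A hedged shape check for the custom fusion block: fusing two same-sized feature maps doubles the spatial resolution on the way out (random weights, shapes only):

import torch
import torch.nn as nn

block = FeatureFusionBlock_custom(64, nn.ReLU(inplace=False), bn=True)
a = torch.randn(1, 64, 16, 16)
b = torch.randn(1, 64, 16, 16)
print(block(a, b).shape)  # torch.Size([1, 64, 32, 32])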

+ 107
- 0
modelscope/models/cv/text_driven_segmentation/lseg_model.py View File

@@ -0,0 +1,107 @@
import os.path as osp
from typing import Any, Dict

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.text_driven_segmentation import \
TextDrivenSegmentation
from modelscope.outputs import OutputKeys
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()
__all__ = ['TextDrivenSeg']


@MODELS.register_module(
Tasks.text_driven_segmentation,
module_name=Models.text_driven_segmentation)
class TextDrivenSeg(TorchModel):
""" text driven segmentation model.
"""

def __init__(self, model_dir, device_id=0, *args, **kwargs):
super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)
self.model = TextDrivenSegmentation(model_dir=model_dir)
pretrained_params = torch.load('{}/{}'.format(
model_dir, ModelFile.TORCH_MODEL_BIN_FILE))
self.model.load_state_dict(pretrained_params)
self.model.eval()
if device_id >= 0 and torch.cuda.is_available():
self.model.to('cuda:{}'.format(device_id))
logger.info('Use GPU: {}'.format(device_id))
else:
device_id = -1
logger.info('Use CPU for inference')
self.device_id = device_id

def preprocess(self, img, size=640):
mean = [0.48145466, 0.4578275, 0.40821073]
std = [0.26862954, 0.26130258, 0.27577711]
h, w, c = img.shape
max_hw = max(h, w)
ratio = 1.0 * size / max_hw
crop_h, crop_w = int(ratio * h), int(ratio * w)
pil_img = Image.fromarray(img)
pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR)
np_img = np.array(pil_img, dtype=np.float32) / 255.
for j in range(3):
np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j]
img_pad = np.zeros((size, size, 3), dtype=np.float32)
img_pad[:crop_h, :crop_w] = np_img
img_pad = torch.from_numpy(img_pad).permute(2, 0,
1).unsqueeze(0).float()
return img_pad, h, w, crop_h, crop_w

def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w):
output = np.clip(tensors * 255., a_min=0, a_max=255.)
crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8)
pil_output = Image.fromarray(crop_output)
pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR)
np_output = np.array(pil_output, dtype=np.uint8)
np_output[np_output < 128] = 0
np_output[np_output >= 128] = 255
np_output = np.uint8(np_output)
return np_output

    def forward(self, image, text):
        """Segment the image region described by ``text``.

        Args:
            image (np.ndarray): uint8 array of shape (height, width, 3).
            text (str): description of the target region.

        Returns:
            dict: {OutputKeys.MASKS: np.ndarray}, a 0/255 mask of shape (height, width).
        """
        image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess(
            image, size=640)
        pred = self.inference(image_tensor, text)
        msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w)
        outputs = {OutputKeys.MASKS: msk}
        return outputs

    def inference(self, image, text):
        """Run the network on a preprocessed image.

        Args:
            image (torch.Tensor): tensor of shape (1, 3, 640, 640).
            text (str): text prompt.

        Returns:
            np.ndarray: per-pixel argmax prediction of shape (640, 640).
        """
        with torch.no_grad():
            if self.device_id == -1:
                output = self.model(image, [text])
else:
device = torch.device('cuda', self.device_id)
output = self.model(image.to(device), [text])
output = F.interpolate(output, size=(640, 640), mode='bilinear')
output = F.softmax(output, dim=1)
output = torch.argmax(output, dim=1)
output = output[0]
if self.device_id == -1:
pred = output.data.numpy()
else:
pred = output.data.cpu().numpy()
del output
return pred
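
# Minimal usage sketch (illustrative only; assumes `model_dir` holds a downloaded copy of
# the model and `img` is an HxWx3 uint8 numpy array):
#
#     seg = TextDrivenSeg(model_dir, device_id=0)
#     result = seg.forward(img, 'the red dress')
#     mask = result[OutputKeys.MASKS]  # uint8 mask with values 0 or 255, same HxW as img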

+ 197
- 0
modelscope/models/cv/text_driven_segmentation/lseg_net.py View File

@@ -0,0 +1,197 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import numpy as np
import torch
import torch.nn as nn

from . import clip
from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
Interpolate, _make_encoder, forward_vit)
from .simple_tokenizer import SimpleTokenizer


class depthwise_clipseg_conv(nn.Module):

def __init__(self):
super(depthwise_clipseg_conv, self).__init__()
self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)

def depthwise_clipseg(self, x, channels):
x = torch.cat(
[self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)],
dim=1)
return x

def forward(self, x):
channels = x.shape[1]
out = self.depthwise_clipseg(x, channels)
return out


class depthwise_conv(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1):
super(depthwise_conv, self).__init__()
self.depthwise = nn.Conv2d(
1, 1, kernel_size=kernel_size, stride=stride, padding=padding)

def forward(self, x):
# support for 4D tensor with NCHW
C, H, W = x.shape[1:]
x = x.reshape(-1, 1, H, W)
x = self.depthwise(x)
x = x.view(-1, C, H, W)
return x


class depthwise_block(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
super(depthwise_block, self).__init__()
self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'lrelu':
self.activation = nn.LeakyReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()

def forward(self, x, act=True):
x = self.depthwise(x)
if act:
x = self.activation(x)
return x


class bottleneck_block(nn.Module):

def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
super(bottleneck_block, self).__init__()
self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
if activation == 'relu':
self.activation = nn.ReLU()
elif activation == 'lrelu':
self.activation = nn.LeakyReLU()
elif activation == 'tanh':
self.activation = nn.Tanh()

def forward(self, x, act=True):
sum_layer = x.max(dim=1, keepdim=True)[0]
x = self.depthwise(x)
x = x + sum_layer
if act:
x = self.activation(x)
return x


class BaseModel(torch.nn.Module):

def load(self, path):
"""Load model from file.
Args:
path (str): file path
"""
parameters = torch.load(path, map_location=torch.device('cpu'))

if 'optimizer' in parameters:
parameters = parameters['model']

self.load_state_dict(parameters)


def _make_fusion_block(features, use_bn):
return FeatureFusionBlock_custom(
features,
activation=nn.ReLU(False),
deconv=False,
bn=use_bn,
expand=False,
align_corners=True,
)


class LSeg(BaseModel):

def __init__(
self,
features=256,
backbone='clip_vitl16_384',
readout='project',
use_bn=True,
model_dir=None,
):
super(LSeg, self).__init__()
hooks = {
'clip_vitl16_384': [5, 11, 17, 23],
}

# Instantiate backbone and reassemble blocks
self.clip_pretrained, self.pretrained, self.scratch = _make_encoder(
backbone,
features,
groups=1,
expand=False,
exportable=False,
hooks=hooks[backbone],
use_readout=readout,
)

self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
self.scratch.refinenet4 = _make_fusion_block(features, use_bn)

self.logit_scale = nn.Parameter(torch.ones([])
* np.log(1 / 0.07)).exp()
self.out_c = 512
self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)

self.scratch.output_conv = nn.Sequential(
Interpolate(scale_factor=2, mode='bilinear', align_corners=True), )

self.tau = 0.07
self.model_dir = model_dir
self.tokenizer = SimpleTokenizer(model_dir
+ '/bpe_simple_vocab_16e6.txt.gz')

def forward(self, x, labelset=''):
text = clip.tokenize(self.tokenizer, labelset)

layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)

layer_1_rn = self.scratch.layer1_rn(layer_1)
layer_2_rn = self.scratch.layer2_rn(layer_2)
layer_3_rn = self.scratch.layer3_rn(layer_3)
layer_4_rn = self.scratch.layer4_rn(layer_4)

path_4 = self.scratch.refinenet4(layer_4_rn)
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

text = text.to(x.device)
text_features = self.clip_pretrained.encode_text(text)

image_features = self.scratch.head1(path_1)

imshape = image_features.shape
image_features = image_features.permute(0, 2, 3,
1).reshape(-1, self.out_c)

# normalized features
image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

logits_per_image = image_features @ text_features.t() / self.tau

out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3],
-1).permute(0, 3, 1, 2)

out = self.scratch.output_conv(out)

return out
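
# Decoding sketch: scratch.head1 maps every pixel of the fused feature map to a 512-d
# embedding; both the pixel embeddings and the CLIP text embeddings are L2-normalized, so
# `image_features @ text_features.t() / self.tau` is a temperature-scaled cosine
# similarity. The result is reshaped to N x num_labels x H x W and upsampled 2x by
# scratch.output_conv.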

+ 543
- 0
modelscope/models/cv/text_driven_segmentation/lseg_vit.py View File

@@ -0,0 +1,543 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

import math
import types

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint

from . import clip

activations = {}


def get_activation(name):

def hook(model, input, output):
activations[name] = output

return hook


attention = {}


def get_attention(name):

def hook(module, input, output):
x = input[0]
B, N, C = x.shape
qkv = (
module.qkv(x).reshape(B, N, 3, module.num_heads,
C // module.num_heads).permute(
2, 0, 3, 1, 4))
q, k, _ = (
qkv[0],
qkv[1],
qkv[2],
) # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * module.scale

attn = attn.softmax(dim=-1) # [:,:,1,1:]
attention[name] = attn

return hook


def get_mean_attention_map(attn, token, shape):
attn = attn[:, :, token, 1:]
attn = attn.unflatten(2, torch.Size([shape[2] // 16,
shape[3] // 16])).float()
attn = torch.nn.functional.interpolate(
attn, size=shape[2:], mode='bicubic', align_corners=False).squeeze(0)

all_attn = torch.mean(attn, 0)

return all_attn


class Slice(nn.Module):

def __init__(self, start_index=1):
super(Slice, self).__init__()
self.start_index = start_index

def forward(self, x):
return x[:, self.start_index:]


class AddReadout(nn.Module):

def __init__(self, start_index=1):
super(AddReadout, self).__init__()
self.start_index = start_index

def forward(self, x):
if self.start_index == 2:
readout = (x[:, 0] + x[:, 1]) / 2
else:
readout = x[:, 0]
return x[:, self.start_index:] + readout.unsqueeze(1)


class ProjectReadout(nn.Module):

def __init__(self, in_features, start_index=1):
super(ProjectReadout, self).__init__()
self.start_index = start_index

self.project = nn.Sequential(
nn.Linear(2 * in_features, in_features), nn.GELU())

def forward(self, x):
readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
features = torch.cat((x[:, self.start_index:], readout), -1)

return self.project(features)


class Transpose(nn.Module):

def __init__(self, dim0, dim1):
super(Transpose, self).__init__()
self.dim0 = dim0
self.dim1 = dim1

def forward(self, x):
x = x.transpose(self.dim0, self.dim1)
return x


def forward_vit(pretrained, x):
b, c, h, w = x.shape

# encoder
_ = pretrained.model.forward_flex(x)

layer_1 = pretrained.activations['1']
layer_2 = pretrained.activations['2']
layer_3 = pretrained.activations['3']
layer_4 = pretrained.activations['4']

layer_1 = pretrained.act_postprocess1[0:2](layer_1)
layer_2 = pretrained.act_postprocess2[0:2](layer_2)
layer_3 = pretrained.act_postprocess3[0:2](layer_3)
layer_4 = pretrained.act_postprocess4[0:2](layer_4)

unflatten = nn.Sequential(
nn.Unflatten(
2,
torch.Size([
h // pretrained.model.patch_size[1],
w // pretrained.model.patch_size[0],
]),
))

if layer_1.ndim == 3:
layer_1 = unflatten(layer_1)
if layer_2.ndim == 3:
layer_2 = unflatten(layer_2)
if layer_3.ndim == 3:
layer_3 = unflatten(layer_3)
if layer_4.ndim == 3:
layer_4 = unflatten(layer_4)

layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
layer_1)
layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
layer_2)
layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
layer_3)
layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
layer_4)

return layer_1, layer_2, layer_3, layer_4
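
# forward_vit relies on the forward hooks registered in _make_vit_b16_backbone: the
# hooked transformer blocks write their token outputs into `activations`, and the
# act_postprocess modules turn those (B, N, C) token sequences back into 2D feature maps
# at four scales (unflattening to h/16 x w/16, then re-projecting and resampling).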


def _resize_pos_embed(self, posemb, gs_h, gs_w):
posemb_tok, posemb_grid = (
posemb[:, :self.start_index],
posemb[0, self.start_index:],
)

gs_old = int(math.sqrt(len(posemb_grid)))

posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=(gs_h, gs_w), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)

posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

return posemb


def forward_flex(self, x):
b, c, h, w = x.shape

pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
w // self.patch_size[0])

B = x.shape[0]

if hasattr(self.patch_embed, 'backbone'):
x = self.patch_embed.backbone(x)
if isinstance(x, (list, tuple)):
x = x[
-1] # last feature if backbone outputs list/tuple of features
x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)

if getattr(self, 'dist_token', None) is not None:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
dist_token = self.dist_token.expand(B, -1, -1)
x = torch.cat((cls_tokens, dist_token, x), dim=1)
else:
cls_tokens = self.cls_token.expand(
B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
x = torch.cat((cls_tokens, x), dim=1)

x = x + pos_embed
x = self.pos_drop(x)

gradient_checkpoint = False
for blk in self.blocks:
if gradient_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)

x = self.norm(x)

return x


def get_readout_oper(vit_features, features, use_readout, start_index=1):
if use_readout == 'ignore':
readout_oper = [Slice(start_index)] * len(features)
elif use_readout == 'add':
readout_oper = [AddReadout(start_index)] * len(features)
elif use_readout == 'project':
readout_oper = [
ProjectReadout(vit_features, start_index) for out_feat in features
]
else:
assert (
False
), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"

return readout_oper


def adapt_input_conv(in_chans, conv_weight):
conv_type = conv_weight.dtype
conv_weight = conv_weight.float(
) # Some weights are in torch.half, ensure it's float for sum on CPU
O, II, J, K = conv_weight.shape
if in_chans == 1:
if II > 3:
assert conv_weight.shape[1] % 3 == 0
# For models with space2depth stems
conv_weight = conv_weight.reshape(O, II // 3, 3, J, K)
conv_weight = conv_weight.sum(dim=2, keepdim=False)
else:
conv_weight = conv_weight.sum(dim=1, keepdim=True)
elif in_chans != 3:
if II != 3:
raise NotImplementedError(
'Weight format not supported by conversion.')
else:
# NOTE this strategy should be better than random init, but there could be other combinations of
# the original RGB input layer weights that'd work better for specific cases.
repeat = int(math.ceil(in_chans / 3))
conv_weight = conv_weight.repeat(1, repeat, 1,
1)[:, :in_chans, :, :]
conv_weight *= (3 / float(in_chans))
conv_weight = conv_weight.to(conv_type)
return conv_weight


@torch.no_grad()
def _load_weights(model, checkpoint_path, prefix=''):
""" Load weights from .npz checkpoints for official Google Brain Flax implementation
"""
import numpy as np

def _n2p(w, t=True):
if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
w = w.flatten()
if t:
if w.ndim == 4:
w = w.transpose([3, 2, 0, 1])
elif w.ndim == 3:
w = w.transpose([2, 0, 1])
elif w.ndim == 2:
w = w.transpose([1, 0])
return torch.from_numpy(w)

w = np.load(checkpoint_path)
if not prefix and 'opt/target/embedding/kernel' in w:
prefix = 'opt/target/'

if hasattr(model.patch_embed, 'backbone'):
# hybrid
backbone = model.patch_embed.backbone
stem_only = not hasattr(backbone, 'stem')
stem = backbone if stem_only else backbone.stem
stem.conv.weight.copy_(
adapt_input_conv(stem.conv.weight.shape[1],
_n2p(w[f'{prefix}conv_root/kernel'])))
stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
if not stem_only:
for i, stage in enumerate(backbone.stages):
for j, block in enumerate(stage.blocks):
bp = f'{prefix}block{i + 1}/unit{j + 1}/'
for r in range(3):
getattr(block, f'conv{r + 1}').weight.copy_(
_n2p(w[f'{bp}conv{r + 1}/kernel']))
getattr(block, f'norm{r + 1}').weight.copy_(
_n2p(w[f'{bp}gn{r + 1}/scale']))
getattr(block, f'norm{r + 1}').bias.copy_(
_n2p(w[f'{bp}gn{r + 1}/bias']))
if block.downsample is not None:
block.downsample.conv.weight.copy_(
_n2p(w[f'{bp}conv_proj/kernel']))
block.downsample.norm.weight.copy_(
_n2p(w[f'{bp}gn_proj/scale']))
block.downsample.norm.bias.copy_(
_n2p(w[f'{bp}gn_proj/bias']))
embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
else:
embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1],
_n2p(w[f'{prefix}embedding/kernel']))
model.patch_embed.proj.weight.copy_(embed_conv_w)
model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
pos_embed_w = _n2p(
w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
if pos_embed_w.shape != model.pos_embed.shape:
pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights
pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens',
1),
model.patch_embed.grid_size)
model.pos_embed.copy_(pos_embed_w)
model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
if isinstance(
model.head, nn.Linear
) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
# NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
# if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
# model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
# model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
for i, block in enumerate(model.blocks.children()):
block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
block.attn.qkv.weight.copy_(
torch.cat([
_n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T
for n in ('query', 'key', 'value')
]))
block.attn.qkv.bias.copy_(
torch.cat([
_n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1)
for n in ('query', 'key', 'value')
]))
block.attn.proj.weight.copy_(
_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
for r in range(2):
getattr(block.mlp, f'fc{r + 1}').weight.copy_(
_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
getattr(block.mlp, f'fc{r + 1}').bias.copy_(
_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))


def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()):
# Rescale the grid of position embeddings when loading from state_dict. Adapted from
# https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
ntok_new = posemb_new.shape[1]
if num_prefix_tokens:
posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[
0, num_prefix_tokens:]
ntok_new -= num_prefix_tokens
else:
posemb_prefix, posemb_grid = posemb[:, :0], posemb[0]
gs_old = int(math.sqrt(len(posemb_grid)))
if not len(gs_new): # backwards compatibility
gs_new = [int(math.sqrt(ntok_new))] * 2
assert len(gs_new) >= 2
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=gs_new, mode='bicubic', align_corners=False)
posemb_grid = posemb_grid.permute(0, 2, 3,
1).reshape(1, gs_new[0] * gs_new[1], -1)
posemb = torch.cat([posemb_prefix, posemb_grid], dim=1)
return posemb
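
# Shape sketch (illustrative numbers): a ViT-L/16 checkpoint trained at 224x224 stores a
# position embedding of shape (1, 1 + 14*14, 1024); loading it at 384x384 needs
# (1, 1 + 24*24, 1024), so the 14x14 grid is interpolated bicubically to 24x24 while the
# class-token embedding is kept as-is.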


def _make_pretrained_clip_vitl16_384(pretrained,
use_readout='ignore',
hooks=None,
enable_attention_hooks=False):
clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False)

# model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
model = timm.create_model('vit_large_patch16_384', pretrained=False)
hooks = [5, 11, 17, 23] if hooks is None else hooks
pretrained = _make_vit_b16_backbone(
model,
features=[256, 512, 1024, 1024],
hooks=hooks,
vit_features=1024,
use_readout=use_readout,
enable_attention_hooks=enable_attention_hooks,
)
return clip_pretrained, pretrained


def _make_vit_b16_backbone(
model,
features=[96, 192, 384, 768],
size=[384, 384],
hooks=[2, 5, 8, 11],
vit_features=768,
use_readout='ignore',
start_index=1,
enable_attention_hooks=False,
):
pretrained = nn.Module()

pretrained.model = model
pretrained.model.blocks[hooks[0]].register_forward_hook(
get_activation('1'))
pretrained.model.blocks[hooks[1]].register_forward_hook(
get_activation('2'))
pretrained.model.blocks[hooks[2]].register_forward_hook(
get_activation('3'))
pretrained.model.blocks[hooks[3]].register_forward_hook(
get_activation('4'))

pretrained.activations = activations

if enable_attention_hooks:
pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
get_attention('attn_1'))
pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
get_attention('attn_2'))
pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
get_attention('attn_3'))
pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
get_attention('attn_4'))
pretrained.attention = attention

readout_oper = get_readout_oper(vit_features, features, use_readout,
start_index)

    # project ViT tokens back to 2D feature maps with `features` channels at 4 scales
pretrained.act_postprocess1 = nn.Sequential(
readout_oper[0],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[0],
kernel_size=1,
stride=1,
padding=0,
),
nn.ConvTranspose2d(
in_channels=features[0],
out_channels=features[0],
kernel_size=4,
stride=4,
padding=0,
bias=True,
dilation=1,
groups=1,
),
)

pretrained.act_postprocess2 = nn.Sequential(
readout_oper[1],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[1],
kernel_size=1,
stride=1,
padding=0,
),
nn.ConvTranspose2d(
in_channels=features[1],
out_channels=features[1],
kernel_size=2,
stride=2,
padding=0,
bias=True,
dilation=1,
groups=1,
),
)

pretrained.act_postprocess3 = nn.Sequential(
readout_oper[2],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[2],
kernel_size=1,
stride=1,
padding=0,
),
)

pretrained.act_postprocess4 = nn.Sequential(
readout_oper[3],
Transpose(1, 2),
nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
nn.Conv2d(
in_channels=vit_features,
out_channels=features[3],
kernel_size=1,
stride=1,
padding=0,
),
nn.Conv2d(
in_channels=features[3],
out_channels=features[3],
kernel_size=3,
stride=2,
padding=1,
),
)

pretrained.model.start_index = start_index
pretrained.model.patch_size = [16, 16]

# We inject this function into the VisionTransformer instances so that
# we can use it with interpolated position embeddings without modifying the library source.
pretrained.model.forward_flex = types.MethodType(forward_flex,
pretrained.model)
pretrained.model._resize_pos_embed = types.MethodType(
_resize_pos_embed, pretrained.model)

return pretrained

+ 458
- 0
modelscope/models/cv/text_driven_segmentation/model.py View File

@@ -0,0 +1,458 @@
"""
Adapted from https://github.com/isl-org/lang-seg.
Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org.
"""

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.relu1 = nn.ReLU(inplace=True)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.relu2 = nn.ReLU(inplace=True)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu3 = nn.ReLU(inplace=True)

self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu1(self.bn1(self.conv1(x)))
out = self.relu2(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu3(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x[:1],
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)
return x.squeeze(0)


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.relu3 = nn.ReLU(inplace=True)
self.avgpool = nn.AvgPool2d(2)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x
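
# Shape sketch for the default 224x224 input: the stem plus avgpool reduce the spatial
# size to 56x56 (stride 4); layer2-layer4 each halve it again, giving a 7x7 map with
# width*32 channels; attnpool then collapses that map into a single `output_dim`-d
# embedding per image.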


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self, width, layers, heads, attn_mask=None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x1 = self.class_embedding.to(x.dtype)
x2 = torch.zeros(
x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([x1 + x2, x], dim=1) # shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
context_length: int,
vocab_size: int,
transformer_width: int,
transformer_heads: int,
transformer_layers: int):
super().__init__()

self.context_length = context_length

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisionTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.transformer = Transformer(
width=transformer_width,
layers=transformer_layers,
heads=transformer_heads,
attn_mask=self.build_attention_mask())

self.vocab_size = vocab_size
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
self.positional_embedding = nn.Parameter(
torch.empty(self.context_length, transformer_width))
self.ln_final = LayerNorm(transformer_width)

self.text_projection = nn.Parameter(
torch.empty(transformer_width, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.initialize_parameters()

def initialize_parameters(self):
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.positional_embedding, std=0.01)

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

proj_std = (self.transformer.width**-0.5) * (
(2 * self.transformer.layers)**-0.5)
attn_std = self.transformer.width**-0.5
fc_std = (2 * self.transformer.width)**-0.5
for block in self.transformer.resblocks:
nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.transformer.width**-0.5)

def build_attention_mask(self):
# lazily create causal attention mask, with full attention between the vision tokens
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float('-inf'))
mask.triu_(1) # zero out the lower diagonal
return mask
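
    # For context_length == 3 the additive mask built above is
    #     [[0., -inf, -inf],
    #      [0.,   0., -inf],
    #      [0.,   0.,   0.]]
    # i.e. every text token may attend only to itself and to earlier tokens.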

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
x = self.token_embedding(text).type(self.dtype)
x = x + self.positional_embedding.type(self.dtype)
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x).type(self.dtype)
x = x[torch.arange(x.shape[0]),
text.argmax(dim=-1)] @ self.text_projection
return x

def forward(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(ll):
if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)):
ll.weight.data = ll.weight.data.half()
if ll.bias is not None:
ll.bias.data = ll.bias.data.half()

if isinstance(ll, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(ll, attr)
if tensor is not None:
tensor.data = tensor.data.half()

for name in ['text_projection', 'proj']:
if hasattr(ll, name):
attr = getattr(ll, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def build_model():
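    # Positional arguments below map onto CLIP.__init__ as: embed_dim=512,
    # image_resolution=224, vision_layers=12, vision_width=768, vision_patch_size=32,
    # context_length=77, vocab_size=49408, transformer_width=512, transformer_heads=8,
    # transformer_layers=12 (i.e. a ViT-B/32 style image tower).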
model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12)
convert_weights(model)
return model.eval()

+ 156
- 0
modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py View File

@@ -0,0 +1,156 @@
""" CLIP
Adapted from https://github.com/openai/CLIP.
Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def default_bpe():
return os.path.join(
os.path.dirname(os.path.abspath(__file__)),
'bpe_simple_vocab_16e6.txt.gz')


@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord('!'),
ord('~') + 1)) + list(range(
ord('¡'),
ord('¬') + 1)) + list(range(ord('®'),
ord('ÿ') + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))


def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs


def basic_clean(text):
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
text = re.sub(r'\s+', ' ', text)
text = text.strip()
return text


class SimpleTokenizer(object):

def __init__(self, bpe_path: str = default_bpe()):
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode('utf-8').split('\n')
merges = merges[1:49152 - 256 - 2 + 1]
merges = [tuple(merge.split()) for merge in merges]
vocab = list(bytes_to_unicode().values())
vocab = vocab + [v + '</w>' for v in vocab]
for merge in merges:
vocab.append(''.join(merge))
vocab.extend(['<|startoftext|>', '<|endoftext|>'])
self.encoder = dict(zip(vocab, range(len(vocab))))
self.decoder = {v: k for k, v in self.encoder.items()}
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {
'<|startoftext|>': '<|startoftext|>',
'<|endoftext|>': '<|endoftext|>'
}
self.pat = re.compile(
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
re.IGNORECASE)

def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (token[-1] + '</w>', )
pairs = get_pairs(word)

if not pairs:
return token + '</w>'

while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
error_list = []
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except Exception as err:
new_word.extend(word[i:])
error_list.append(err)
break

if word[i] == first and i < len(word) - 1 and word[
i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word

def encode(self, text):
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b]
for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode(
'utf-8', errors='replace').replace('</w>', ' ')
return text
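
# Round-trip sketch (illustrative; assumes bpe_simple_vocab_16e6.txt.gz is available at
# the default_bpe() path or is passed in explicitly):
#
#     tok = SimpleTokenizer()
#     ids = tok.encode('a photo of a cat')
#     print(tok.decode(ids))  # -> 'a photo of a cat ' (decode re-expands '</w>' to spaces)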

+ 24
- 0
modelscope/models/cv/tinynas_detection/__init__.py View File

@@ -0,0 +1,24 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .tinynas_detector import TinynasDetector

else:
_import_structure = {
'tinynas_detector': ['TinynasDetector'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 16
- 0
modelscope/models/cv/tinynas_detection/backbone/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .darknet import CSPDarknet
from .tinynas import load_tinynas_net


def build_backbone(cfg):
backbone_cfg = copy.deepcopy(cfg)
name = backbone_cfg.pop('name')
if name == 'CSPDarknet':
return CSPDarknet(**backbone_cfg)
elif name == 'TinyNAS':
return load_tinynas_net(backbone_cfg)

+ 126
- 0
modelscope/models/cv/tinynas_detection/backbone/darknet.py View File

@@ -0,0 +1,126 @@
# Copyright (c) Megvii Inc. All rights reserved.
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
from torch import nn

from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer,
SPPBottleneck)


class CSPDarknet(nn.Module):

def __init__(
self,
dep_mul,
wid_mul,
out_features=('dark3', 'dark4', 'dark5'),
depthwise=False,
act='silu',
reparam=False,
):
super(CSPDarknet, self).__init__()
assert out_features, 'please provide output features of Darknet'
self.out_features = out_features
Conv = DWConv if depthwise else BaseConv

base_channels = int(wid_mul * 64) # 64
base_depth = max(round(dep_mul * 3), 1) # 3

# stem
# self.stem = Focus(3, base_channels, ksize=3, act=act)
self.stem = Focus(3, base_channels, 3, act=act)

# dark2
self.dark2 = nn.Sequential(
Conv(base_channels, base_channels * 2, 3, 2, act=act),
CSPLayer(
base_channels * 2,
base_channels * 2,
n=base_depth,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark3
self.dark3 = nn.Sequential(
Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
CSPLayer(
base_channels * 4,
base_channels * 4,
n=base_depth * 3,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark4
self.dark4 = nn.Sequential(
Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
CSPLayer(
base_channels * 8,
base_channels * 8,
n=base_depth * 3,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

# dark5
self.dark5 = nn.Sequential(
Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
SPPBottleneck(
base_channels * 16, base_channels * 16, activation=act),
CSPLayer(
base_channels * 16,
base_channels * 16,
n=base_depth,
shortcut=False,
depthwise=depthwise,
act=act,
reparam=reparam,
),
)

def init_weights(self, pretrain=None):

if pretrain is None:
return
else:
pretrained_dict = torch.load(
pretrain, map_location='cpu')['state_dict']
new_params = self.state_dict().copy()
for k, v in pretrained_dict.items():
ks = k.split('.')
if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[
-1] == 'total_params':
continue
else:
new_params[k] = v

self.load_state_dict(new_params)
            print(f'loaded pretrained backbone from {pretrain}')

def forward(self, x):
outputs = {}
x = self.stem(x)
outputs['stem'] = x
x = self.dark2(x)
outputs['dark2'] = x
x = self.dark3(x)
outputs['dark3'] = x
x = self.dark4(x)
outputs['dark4'] = x
x = self.dark5(x)
outputs['dark5'] = x
features_out = [
outputs['stem'], outputs['dark2'], outputs['dark3'],
outputs['dark4'], outputs['dark5']
]

return features_out
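
# With the default settings the returned feature maps are [stem, dark2, dark3, dark4,
# dark5] at strides [2, 4, 8, 16, 32] relative to the input, with channel counts of
# [1, 2, 4, 8, 16] * base_channels where base_channels = int(wid_mul * 64).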

+ 347
- 0
modelscope/models/cv/tinynas_detection/backbone/tinynas.py View File

@@ -0,0 +1,347 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from ..core.base_ops import Focus, SPPBottleneck, get_activation
from ..core.repvgg_block import RepVggBlock


class ConvKXBN(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride):
super(ConvKXBN, self).__init__()
self.conv1 = nn.Conv2d(
in_c,
out_c,
kernel_size,
stride, (kernel_size - 1) // 2,
groups=1,
bias=False)
self.bn1 = nn.BatchNorm2d(out_c)

def forward(self, x):
return self.bn1(self.conv1(x))


class ConvKXBNRELU(nn.Module):

def __init__(self, in_c, out_c, kernel_size, stride, act='silu'):
super(ConvKXBNRELU, self).__init__()
self.conv = ConvKXBN(in_c, out_c, kernel_size, stride)
if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

def forward(self, x):
output = self.conv(x)
return self.activation_function(output)


class ResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu'):
super(ResConvK1KX, self).__init__()
self.stride = stride
self.conv1 = ConvKXBN(in_c, btn_c, 1, 1)
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride != 2:
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)
if self.stride != 2:
output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvK1KX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu'):
super(SuperResConvK1KX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
                force_resproj = False  # used inside a CSPLayer, so no forced residual projection is needed
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvK1KX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class ResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
force_resproj=False,
act='silu'):
super(ResConvKXKX, self).__init__()
self.stride = stride
if self.stride == 2:
self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act)
else:
self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1)
self.conv2 = RepVggBlock(
btn_c, out_c, kernel_size, stride, act='identity')

if act is None:
self.activation_function = torch.relu
else:
self.activation_function = get_activation(act)

if stride == 2:
self.residual_downsample = nn.AvgPool2d(
kernel_size=2, stride=2)
else:
self.residual_downsample = nn.Identity()

if in_c != out_c or force_resproj:
self.residual_proj = ConvKXBN(in_c, out_c, 1, 1)
else:
self.residual_proj = nn.Identity()

def forward(self, x):
if self.stride == 2:
return self.downsampler(x)
reslink = self.residual_downsample(x)
reslink = self.residual_proj(reslink)

output = x
output = self.conv1(output)
output = self.activation_function(output)
output = self.conv2(output)

output = output + reslink
output = self.activation_function(output)

return output


class SuperResConvKXKX(nn.Module):

def __init__(self,
in_c,
out_c,
btn_c,
kernel_size,
stride,
num_blocks,
with_spp=False,
act='silu'):
super(SuperResConvKXKX, self).__init__()
if act is None:
self.act = torch.relu
else:
self.act = get_activation(act)
self.block_list = nn.ModuleList()
for block_id in range(num_blocks):
if block_id == 0:
in_channels = in_c
out_channels = out_c
this_stride = stride
                force_resproj = False  # used inside a CSPLayer, so no forced residual projection is needed
this_kernel_size = kernel_size
else:
in_channels = out_c
out_channels = out_c
this_stride = 1
force_resproj = False
this_kernel_size = kernel_size
the_block = ResConvKXKX(
in_channels,
out_channels,
btn_c,
this_kernel_size,
this_stride,
force_resproj,
act=act)
self.block_list.append(the_block)
if block_id == 0 and with_spp:
self.block_list.append(
SPPBottleneck(out_channels, out_channels))

def forward(self, x):
output = x
for block in self.block_list:
output = block(output)
return output


class TinyNAS(nn.Module):

def __init__(self,
structure_info=None,
out_indices=[0, 1, 2, 4, 5],
out_channels=[None, None, 128, 256, 512],
with_spp=False,
use_focus=False,
need_conv1=True,
act='silu'):
super(TinyNAS, self).__init__()
assert len(out_indices) == len(out_channels)
self.out_indices = out_indices
self.need_conv1 = need_conv1

self.block_list = nn.ModuleList()
if need_conv1:
self.conv1_list = nn.ModuleList()
for idx, block_info in enumerate(structure_info):
the_block_class = block_info['class']
if the_block_class == 'ConvKXBNRELU':
if use_focus:
the_block = Focus(block_info['in'], block_info['out'],
block_info['k'])
else:
the_block = ConvKXBNRELU(
block_info['in'],
block_info['out'],
block_info['k'],
block_info['s'],
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvK1KX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvK1KX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act)
self.block_list.append(the_block)
elif the_block_class == 'SuperResConvKXKX':
spp = with_spp if idx == len(structure_info) - 1 else False
the_block = SuperResConvKXKX(
block_info['in'],
block_info['out'],
block_info['btn'],
block_info['k'],
block_info['s'],
block_info['L'],
spp,
act=act)
self.block_list.append(the_block)
if need_conv1:
if idx in self.out_indices and out_channels[
self.out_indices.index(idx)] is not None:
self.conv1_list.append(
nn.Conv2d(block_info['out'],
out_channels[self.out_indices.index(idx)],
1))
else:
self.conv1_list.append(None)

def init_weights(self, pretrain=None):
pass

def forward(self, x):
output = x
stage_feature_list = []
for idx, block in enumerate(self.block_list):
output = block(output)
if idx in self.out_indices:
if self.need_conv1 and self.conv1_list[idx] is not None:
true_out = self.conv1_list[idx](output)
stage_feature_list.append(true_out)
else:
stage_feature_list.append(output)
return stage_feature_list


def load_tinynas_net(backbone_cfg):
    # build the TinyNAS backbone from its serialized structure string
import ast

struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str])
struct_info = ast.literal_eval(struct_str)
for layer in struct_info:
if 'nbitsA' in layer:
del layer['nbitsA']
if 'nbitsW' in layer:
del layer['nbitsW']

model = TinyNAS(
structure_info=struct_info,
out_indices=backbone_cfg.out_indices,
out_channels=backbone_cfg.out_channels,
with_spp=backbone_cfg.with_spp,
use_focus=backbone_cfg.use_focus,
act=backbone_cfg.act,
need_conv1=backbone_cfg.need_conv1,
)

return model
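
# Expected fields on `backbone_cfg` (taken from the code above): net_structure_str (a
# string, or list of string fragments, that literal-evals to a list of block dicts with
# keys such as 'class', 'in', 'out', 'btn', 'k', 's', 'L'), out_indices, out_channels,
# with_spp, use_focus, act and need_conv1. Any quantization fields ('nbitsA', 'nbitsW')
# in the structure are stripped before the backbone is built.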

+ 2
- 0
modelscope/models/cv/tinynas_detection/core/__init__.py View File

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

+ 474
- 0
modelscope/models/cv/tinynas_detection/core/base_ops.py View File

@@ -0,0 +1,474 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from .repvgg_block import RepVggBlock


class SiLU(nn.Module):
"""export-friendly version of nn.SiLU()"""

@staticmethod
def forward(x):
return x * torch.sigmoid(x)


def get_activation(name='silu', inplace=True):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module


def get_norm(name, out_channels, inplace=True):
    if name == 'bn':
        module = nn.BatchNorm2d(out_channels)
    elif name == 'gn':
        module = nn.GroupNorm(num_channels=out_channels, num_groups=32)
    else:
        raise AttributeError('Unsupported norm type: {}'.format(name))
    return module


class BaseConv(nn.Module):
"""A Conv2d -> Batchnorm -> silu/leaky relu block"""

def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
groups=1,
bias=False,
act='silu',
norm='bn'):
super().__init__()
# same padding
pad = (ksize - 1) // 2
self.conv = nn.Conv2d(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=pad,
groups=groups,
bias=bias,
)
if norm is not None:
self.bn = get_norm(norm, out_channels, inplace=True)
if act is not None:
self.act = get_activation(act, inplace=True)
self.with_norm = norm is not None
self.with_act = act is not None

def forward(self, x):
x = self.conv(x)
if self.with_norm:
# x = self.norm(x)
x = self.bn(x)
if self.with_act:
x = self.act(x)
return x

def fuseforward(self, x):
return self.act(self.conv(x))


class DepthWiseConv(nn.Module):

def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
groups=None,
bias=False,
act='silu',
norm='bn'):
super().__init__()
padding = (ksize - 1) // 2
self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size=ksize,
stride=stride,
padding=padding,
groups=in_channels,
bias=bias,
)

self.pointwise = nn.Conv2d(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=bias)
if norm is not None:
self.dwnorm = get_norm(norm, in_channels, inplace=True)
self.pwnorm = get_norm(norm, out_channels, inplace=True)
if act is not None:
self.act = get_activation(act, inplace=True)

self.with_norm = norm is not None
self.with_act = act is not None
self.order = ['depthwise', 'dwnorm', 'pointwise', 'act']

def forward(self, x):

for layer_name in self.order:
layer = self.__getattr__(layer_name)
if layer is not None:
x = layer(x)
return x


class DWConv(nn.Module):
"""Depthwise Conv + Conv"""

def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'):
super().__init__()
self.dconv = BaseConv(
in_channels,
in_channels,
ksize=ksize,
stride=stride,
groups=in_channels,
act=act,
)
self.pconv = BaseConv(
in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)

def forward(self, x):
x = self.dconv(x)
return self.pconv(x)


class Bottleneck(nn.Module):
# Standard bottleneck
def __init__(
self,
in_channels,
out_channels,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
reparam=False,
):
super().__init__()
hidden_channels = int(out_channels * expansion)
Conv = DWConv if depthwise else BaseConv
k_conv1 = 3 if reparam else 1
self.conv1 = BaseConv(
in_channels, hidden_channels, k_conv1, stride=1, act=act)
if reparam:
self.conv2 = RepVggBlock(
hidden_channels, out_channels, 3, stride=1, act=act)
else:
self.conv2 = Conv(
hidden_channels, out_channels, 3, stride=1, act=act)
self.use_add = shortcut and in_channels == out_channels

def forward(self, x):
y = self.conv2(self.conv1(x))
if self.use_add:
y = y + x
return y


class ResLayer(nn.Module):
    """Residual layer with `in_channels` inputs."""

def __init__(self, in_channels: int):
super().__init__()
mid_channels = in_channels // 2
self.layer1 = BaseConv(
in_channels, mid_channels, ksize=1, stride=1, act='lrelu')
self.layer2 = BaseConv(
mid_channels, in_channels, ksize=3, stride=1, act='lrelu')

def forward(self, x):
out = self.layer2(self.layer1(x))
return x + out


class SPPBottleneck(nn.Module):
"""Spatial pyramid pooling layer used in YOLOv3-SPP"""

def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
activation='silu'):
super().__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=activation)
self.m = nn.ModuleList([
nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = BaseConv(
conv2_channels, out_channels, 1, stride=1, act=activation)

def forward(self, x):
x = self.conv1(x)
x = torch.cat([x] + [m(x) for m in self.m], dim=1)
x = self.conv2(x)
return x
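

# A minimal sketch (not part of the original file): the stride-1, same-padded
# max-pool pyramid keeps the spatial resolution, so only the channel count
# grows before conv2 maps it back to out_channels.
def _spp_bottleneck_sketch():
    spp = SPPBottleneck(in_channels=256, out_channels=256)
    return spp(torch.randn(1, 256, 20, 20)).shape  # torch.Size([1, 256, 20, 20])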


class CSPLayer(nn.Module):
"""C3 in yolov5, CSP Bottleneck with 3 convolutions"""

def __init__(
self,
in_channels,
out_channels,
n=1,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
reparam=False,
):
"""
Args:
in_channels (int): input channels.
out_channels (int): output channels.
n (int): number of Bottlenecks. Default value: 1.
"""
# ch_in, ch_out, number, shortcut, groups, expansion
super().__init__()
hidden_channels = int(out_channels * expansion) # hidden channels
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv3 = BaseConv(
2 * hidden_channels, out_channels, 1, stride=1, act=act)
module_list = [
Bottleneck(
hidden_channels,
hidden_channels,
shortcut,
1.0,
depthwise,
act=act,
reparam=reparam) for _ in range(n)
]
self.m = nn.Sequential(*module_list)

def forward(self, x):
x_1 = self.conv1(x)
x_2 = self.conv2(x)
x_1 = self.m(x_1)
x = torch.cat((x_1, x_2), dim=1)
return self.conv3(x)
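

# A minimal sketch (not part of the original file): the two 1x1 branches each
# carry int(out_channels * expansion) channels; their concatenation is fused
# back to out_channels by conv3, leaving the spatial size unchanged.
def _csp_layer_sketch():
    csp = CSPLayer(in_channels=64, out_channels=128, n=2)
    return csp(torch.randn(1, 64, 32, 32)).shape  # torch.Size([1, 128, 32, 32])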


class Focus(nn.Module):
"""Focus width and height information into channel space."""

def __init__(self,
in_channels,
out_channels,
ksize=1,
stride=1,
act='silu'):
super().__init__()
self.conv = BaseConv(
in_channels * 4, out_channels, ksize, stride, act=act)

def forward(self, x):
# shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
patch_top_left = x[..., ::2, ::2]
patch_top_right = x[..., ::2, 1::2]
patch_bot_left = x[..., 1::2, ::2]
patch_bot_right = x[..., 1::2, 1::2]
x = torch.cat(
(
patch_top_left,
patch_bot_left,
patch_top_right,
patch_bot_right,
),
dim=1,
)
return self.conv(x)
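

# A minimal sketch (not part of the original file): the four interleaved
# slices turn a (b, c, h, w) tensor into (b, 4c, h/2, w/2) before the
# convolution, so the output spatial size is halved.
def _focus_sketch():
    focus = Focus(in_channels=3, out_channels=32, ksize=3)
    return focus(torch.randn(1, 3, 64, 64)).shape  # torch.Size([1, 32, 32, 32])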


class fast_Focus(nn.Module):

def __init__(self,
in_channels,
out_channels,
ksize=1,
stride=1,
act='silu'):
        super().__init__()
self.conv1 = self.focus_conv(w1=1.0)
self.conv2 = self.focus_conv(w3=1.0)
self.conv3 = self.focus_conv(w2=1.0)
self.conv4 = self.focus_conv(w4=1.0)

self.conv = BaseConv(
in_channels * 4, out_channels, ksize, stride, act=act)

def forward(self, x):
return self.conv(
torch.cat(
[self.conv1(x),
self.conv2(x),
self.conv3(x),
self.conv4(x)], 1))

def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
conv = nn.Conv2d(3, 3, 2, 2, groups=3, bias=False)
conv.weight = self.init_weights_constant(w1, w2, w3, w4)
conv.weight.requires_grad = False
return conv

def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0):
return nn.Parameter(
torch.tensor([[[[w1, w2], [w3, w4]]], [[[w1, w2], [w3, w4]]],
[[[w1, w2], [w3, w4]]]]))


# shufflenet block
def channel_shuffle(x, groups=2):
bat_size, channels, w, h = x.shape
group_c = channels // groups
x = x.view(bat_size, groups, group_c, w, h)
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(bat_size, -1, w, h)
return x
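

# A tiny sketch (not part of the original file): with groups=2 the output
# interleaves the two channel halves, i.e. channel order 0..7 becomes
# [0, 4, 1, 5, 2, 6, 3, 7].
def _channel_shuffle_sketch():
    x = torch.arange(8, dtype=torch.float32).view(1, 8, 1, 1)
    return channel_shuffle(x, groups=2).flatten().tolist()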


def conv_1x1_bn(in_c, out_c, stride=1):
return nn.Sequential(
nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False),
nn.BatchNorm2d(out_c), nn.ReLU(True))


def conv_bn(in_c, out_c, stride=2):
return nn.Sequential(
nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False),
nn.BatchNorm2d(out_c), nn.ReLU(True))


class ShuffleBlock(nn.Module):

def __init__(self, in_c, out_c, downsample=False):
super(ShuffleBlock, self).__init__()
self.downsample = downsample
half_c = out_c // 2
if downsample:
self.branch1 = nn.Sequential(
# 3*3 dw conv, stride = 2
# nn.Conv2d(in_c, in_c, 3, 2, 1, groups=in_c, bias=False),
nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False),
nn.BatchNorm2d(in_c),
# 1*1 pw conv
nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))

self.branch2 = nn.Sequential(
# 1*1 pw conv
nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True),
# 3*3 dw conv, stride = 2
# nn.Conv2d(half_c, half_c, 3, 2, 1, groups=half_c, bias=False),
nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
nn.BatchNorm2d(half_c),
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))
else:
# in_c = out_c
assert in_c == out_c

self.branch2 = nn.Sequential(
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True),
# 3*3 dw conv, stride = 1
nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False),
nn.BatchNorm2d(half_c),
# 1*1 pw conv
nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False),
nn.BatchNorm2d(half_c),
nn.ReLU(True))

def forward(self, x):
out = None
if self.downsample:
# if it is downsampling, we don't need to do channel split
out = torch.cat((self.branch1(x), self.branch2(x)), 1)
else:
# channel split
channels = x.shape[1]
c = channels // 2
x1 = x[:, :c, :, :]
x2 = x[:, c:, :, :]
out = torch.cat((x1, self.branch2(x2)), 1)
return channel_shuffle(out, 2)


class ShuffleCSPLayer(nn.Module):
"""C3 in yolov5, CSP Bottleneck with 3 convolutions"""

def __init__(
self,
in_channels,
out_channels,
n=1,
shortcut=True,
expansion=0.5,
depthwise=False,
act='silu',
):
"""
Args:
in_channels (int): input channels.
out_channels (int): output channels.
n (int): number of Bottlenecks. Default value: 1.
"""
# ch_in, ch_out, number, shortcut, groups, expansion
super().__init__()
hidden_channels = int(out_channels * expansion) # hidden channels
self.conv1 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, 1, stride=1, act=act)
module_list = [
Bottleneck(
hidden_channels,
hidden_channels,
shortcut,
1.0,
depthwise,
act=act) for _ in range(n)
]
self.m = nn.Sequential(*module_list)

def forward(self, x):
x_1 = self.conv1(x)
x_2 = self.conv2(x)
x_1 = self.m(x_1)
x = torch.cat((x_1, x_2), dim=1)
# add channel shuffle
return channel_shuffle(x, 2)

+ 324
- 0
modelscope/models/cv/tinynas_detection/core/neck_ops.py View File

@@ -0,0 +1,324 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Swish(nn.Module):

def __init__(self, inplace=True):
super(Swish, self).__init__()
self.inplace = inplace

def forward(self, x):
if self.inplace:
            x.mul_(torch.sigmoid(x))
            return x
        else:
            return x * torch.sigmoid(x)


def get_activation(name='silu', inplace=True):
if name is None:
return nn.Identity()

if isinstance(name, str):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
elif name == 'swish':
module = Swish(inplace=inplace)
elif name == 'hardsigmoid':
module = nn.Hardsigmoid(inplace=inplace)
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module
elif isinstance(name, nn.Module):
return name
else:
raise AttributeError('Unsupported act type: {}'.format(name))


class ConvBNLayer(nn.Module):

def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2d(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias=False)
        self.bn = nn.BatchNorm2d(ch_out)
self.act = get_activation(act, inplace=True)

def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)

return x


class RepVGGBlock(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', deploy=False):
super(RepVGGBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.deploy = deploy
self.in_channels = ch_in
self.groups = 1
if self.deploy is False:
self.rbr_dense = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=None)
self.rbr_1x1 = ConvBNLayer(
ch_in, ch_out, 1, stride=1, padding=0, act=None)
# self.rbr_identity = nn.BatchNorm2d(num_features=ch_in) if ch_out == ch_in else None
self.rbr_identity = None
else:
self.rbr_reparam = nn.Conv2d(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
self.act = get_activation(act) if act is None or isinstance(
act, (str, dict)) else act

def forward(self, x):
if self.deploy:
print('----------deploy----------')
y = self.rbr_reparam(x)
else:
if self.rbr_identity is None:
y = self.rbr_dense(x) + self.rbr_1x1(x)
else:
y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x)

y = self.act(y)
return y

def switch_to_deploy(self):
print('switch')
if not hasattr(self, 'rbr_reparam'):
# return
self.rbr_reparam = nn.Conv2d(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
print('switch')
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
# self.__delattr__(self.rbr_dense)
# self.__delattr__(self.rbr_1x1)
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True

def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
# if isinstance(branch, nn.Sequential):
if isinstance(branch, ConvBNLayer):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(
branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std


class BasicBlock(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock, self).__init__()
assert ch_in == ch_out
# self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
# y = self.conv1(x)
y = self.conv2(x)
if self.shortcut:
return x + y
else:
return y


class BasicBlock_3x3(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock_3x3, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
if self.shortcut:
return x + y
else:
return y


class BasicBlock_3x3_Reverse(nn.Module):

def __init__(self, ch_in, ch_out, act='relu', shortcut=True):
super(BasicBlock_3x3_Reverse, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=act)
# self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act)
self.conv2 = RepVGGBlock(ch_in, ch_out, act=act)
self.shortcut = shortcut

def forward(self, x):
y = self.conv2(x)
y = self.conv1(y)
if self.shortcut:
return x + y
else:
return y


class SPP(nn.Module):

def __init__(
self,
ch_in,
ch_out,
k,
pool_size,
act='swish',
):
super(SPP, self).__init__()
self.pool = []
for i, size in enumerate(pool_size):
pool = nn.MaxPool2d(
kernel_size=size, stride=1, padding=size // 2, ceil_mode=False)
self.add_module('pool{}'.format(i), pool)
self.pool.append(pool)
self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)

def forward(self, x):
outs = [x]

for pool in self.pool:
outs.append(pool(x))
y = torch.cat(outs, axis=1)

y = self.conv(y)
return y


class CSPStage(nn.Module):

def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False):
super(CSPStage, self).__init__()

ch_mid = int(ch_out // 2)
self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
# self.conv2 = ConvBNLayer(ch_in, ch_mid, 3, stride=1, padding=1, act=act)
self.convs = nn.Sequential()

next_ch_in = ch_mid
for i in range(n):
if block_fn == 'BasicBlock':
self.convs.add_module(
str(i),
BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False))
elif block_fn == 'BasicBlock_3x3':
self.convs.add_module(
str(i),
BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True))
elif block_fn == 'BasicBlock_3x3_Reverse':
self.convs.add_module(
str(i),
BasicBlock_3x3_Reverse(
next_ch_in, ch_mid, act=act, shortcut=True))
else:
raise NotImplementedError
if i == (n - 1) // 2 and spp:
self.convs.add_module(
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
next_ch_in = ch_mid
# self.convs = nn.Sequential(*convs)
self.conv3 = ConvBNLayer(ch_mid * (n + 1), ch_out, 1, act=act)

def forward(self, x):
y1 = self.conv1(x)
y2 = self.conv2(x)

mid_out = [y1]
for conv in self.convs:
y2 = conv(y2)
mid_out.append(y2)
y = torch.cat(mid_out, axis=1)
y = self.conv3(y)
return y

+ 205
- 0
modelscope/models/cv/tinynas_detection/core/repvgg_block.py View File

@@ -0,0 +1,205 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter


def get_activation(name='silu', inplace=True):
if name == 'silu':
module = nn.SiLU(inplace=inplace)
elif name == 'relu':
module = nn.ReLU(inplace=inplace)
elif name == 'lrelu':
module = nn.LeakyReLU(0.1, inplace=inplace)
elif name == 'identity':
module = nn.Identity()
else:
raise AttributeError('Unsupported act type: {}'.format(name))
return module


def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
'''Basic cell for rep-style block, including conv and bn'''
result = nn.Sequential()
result.add_module(
'conv',
nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
bias=False))
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
return result


class RepVggBlock(nn.Module):
'''RepVggBlock is a basic rep-style block, including training and deploy status
This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
'''

def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
dilation=1,
groups=1,
padding_mode='zeros',
deploy=False,
use_se=False,
act='relu',
norm=None):
super(RepVggBlock, self).__init__()
""" Initialization of the class.
Args:
in_channels (int): Number of channels in the input image
out_channels (int): Number of channels produced by the convolution
kernel_size (int or tuple): Size of the convolving kernel
stride (int or tuple, optional): Stride of the convolution. Default: 1
padding (int or tuple, optional): Zero-padding added to both sides of
the input. Default: 1
dilation (int or tuple, optional): Spacing between kernel elements. Default: 1
groups (int, optional): Number of blocked connections from input
channels to output channels. Default: 1
padding_mode (string, optional): Default: 'zeros'
deploy: Whether to be deploy status or training status. Default: False
use_se: Whether to use se. Default: False
"""
self.deploy = deploy
self.groups = groups
self.in_channels = in_channels
self.out_channels = out_channels

assert kernel_size == 3
assert padding == 1

padding_11 = padding - kernel_size // 2

if isinstance(act, str):
self.nonlinearity = get_activation(act)
else:
self.nonlinearity = act

if use_se:
raise NotImplementedError('se block not supported yet')
else:
self.se = nn.Identity()

if deploy:
self.rbr_reparam = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=True,
padding_mode=padding_mode)

else:
self.rbr_identity = None
self.rbr_dense = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups)
self.rbr_1x1 = conv_bn(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=padding_11,
groups=groups)

def forward(self, inputs):
'''Forward process'''
if hasattr(self, 'rbr_reparam'):
return self.nonlinearity(self.se(self.rbr_reparam(inputs)))

if self.rbr_identity is None:
id_out = 0
else:
id_out = self.rbr_identity(inputs)

return self.nonlinearity(
self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))

def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])

def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
if isinstance(branch, nn.Sequential):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3),
dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(
branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std

def switch_to_deploy(self):
if hasattr(self, 'rbr_reparam'):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam = nn.Conv2d(
in_channels=self.rbr_dense.conv.in_channels,
out_channels=self.rbr_dense.conv.out_channels,
kernel_size=self.rbr_dense.conv.kernel_size,
stride=self.rbr_dense.conv.stride,
padding=self.rbr_dense.conv.padding,
dilation=self.rbr_dense.conv.dilation,
groups=self.rbr_dense.conv.groups,
bias=True)
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True
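

# A minimal sketch (not part of the original file): after switch_to_deploy()
# the fused 3x3 convolution should reproduce the multi-branch output up to
# floating-point error; eval() makes BatchNorm use its running statistics so
# the fusion is exact.
def _repvgg_reparam_sketch():
    block = RepVggBlock(in_channels=8, out_channels=8, act='relu').eval()
    x = torch.randn(1, 8, 32, 32)
    y_multi_branch = block(x)
    block.switch_to_deploy()
    y_fused = block(x)
    return torch.allclose(y_multi_branch, y_fused, atol=1e-5)  # True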

+ 196
- 0
modelscope/models/cv/tinynas_detection/core/utils.py View File

@@ -0,0 +1,196 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import numpy as np
import torch
import torchvision

__all__ = [
'filter_box',
'postprocess_airdet',
'bboxes_iou',
'matrix_iou',
'adjust_box_anns',
'xyxy2xywh',
'xyxy2cxcywh',
]


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
iou_thr,
max_num=100,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
        iou_thr (float): NMS IoU threshold.
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
            (k,). Labels are 0-based.
"""
num_classes = multi_scores.size(1)
# exclude background category
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores
# filter out boxes with low scores
valid_mask = scores > score_thr # 1000 * 80 bool

# We use masked_select for ONNX exporting purpose,
# which is equivalent to bboxes = bboxes[valid_mask]
# (TODO): as ONNX does not support repeat now,
# we have to use this ugly code
# bboxes -> 1000, 4
bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4) # mask-> 1000*80*4, 80000*4
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
scores = multi_bboxes.new_zeros((0, ))

return bboxes, scores, labels

keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)

if max_num > 0:
keep = keep[:max_num]

return bboxes[keep], scores[keep], labels[keep]
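

# A minimal usage sketch (not part of the original file): class-aware NMS on a
# few dummy boxes with (n, 4) boxes and (n, num_classes) scores; the two
# heavily overlapping class-0 boxes collapse into a single detection.
def _multiclass_nms_sketch():
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [50., 50., 60., 60.]])
    scores = torch.tensor([[0.9, 0.1],
                           [0.8, 0.2],
                           [0.1, 0.95]])
    return multiclass_nms(boxes, scores, score_thr=0.3, iou_thr=0.5, max_num=10)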


def filter_box(output, scale_range):
"""
output: (N, 5+class) shape
"""
min_scale, max_scale = scale_range
w = output[:, 2] - output[:, 0]
h = output[:, 3] - output[:, 1]
keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
return output[keep]


def filter_results(boxlist, num_classes, nms_thre):
boxes = boxlist.bbox
scores = boxlist.get_field('scores')
cls = boxlist.get_field('labels')
nms_out_index = torchvision.ops.batched_nms(
boxes,
scores,
cls,
nms_thre,
)
boxlist = boxlist[nms_out_index]

return boxlist


def postprocess_airdet(prediction,
num_classes,
conf_thre=0.7,
nms_thre=0.45,
imgs=None):
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4]
output = [None for _ in range(len(prediction))]
for i, image_pred in enumerate(prediction):
# If none are remaining => process next image
if not image_pred.size(0):
continue
multi_bboxes = image_pred[:, :4]
multi_scores = image_pred[:, 5:]
detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
conf_thre, nms_thre, 500)
detections = torch.cat(
(detections, scores[:, None], scores[:, None], labels[:, None]),
dim=1)

if output[i] is None:
output[i] = detections
else:
output[i] = torch.cat((output[i], detections))
return output


def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
raise IndexError

if xyxy:
tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
else:
tl = torch.max(
(bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
)
br = torch.min(
(bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
(bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
)

area_a = torch.prod(bboxes_a[:, 2:], 1)
area_b = torch.prod(bboxes_b[:, 2:], 1)
en = (tl < br).type(tl.type()).prod(dim=2)
area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all())
return area_i / (area_a[:, None] + area_b - area_i)


def matrix_iou(a, b):
"""
    Return IoU of a and b, numpy version for data augmentation.
"""
lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])

area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)


def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
return bbox


def xyxy2xywh(bboxes):
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
return bboxes


def xyxy2cxcywh(bboxes):
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
return bboxes
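

# A quick sketch (not part of the original file): both converters modify the
# input in place, so pass a copy when the corner format is still needed.
def _xyxy2cxcywh_sketch():
    boxes = np.array([[10., 20., 30., 60.]])
    return xyxy2cxcywh(boxes.copy())  # [[20., 40., 20., 40.]] -> cx, cy, w, h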

+ 181
- 0
modelscope/models/cv/tinynas_detection/detector.py View File

@@ -0,0 +1,181 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import os.path as osp
import pickle

import cv2
import torch
import torchvision

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .backbone import build_backbone
from .head import build_head
from .neck import build_neck
from .utils import parse_config


class SingleStageDetector(TorchModel):
"""
The base class of single stage detector.
"""

def __init__(self, model_dir: str, *args, **kwargs):
"""
init model by cfg
"""
super().__init__(model_dir, *args, **kwargs)

config_path = osp.join(model_dir, 'airdet_s.py')
config = parse_config(config_path)
self.cfg = config
model_path = osp.join(model_dir, config.model.name)
label_map = osp.join(model_dir, config.model.class_map)
self.label_map = pickle.load(open(label_map, 'rb'))
self.size_divisible = config.dataset.size_divisibility
self.num_classes = config.model.head.num_classes
self.conf_thre = config.model.head.nms_conf_thre
self.nms_thre = config.model.head.nms_iou_thre

self.backbone = build_backbone(self.cfg.model.backbone)
self.neck = build_neck(self.cfg.model.neck)
self.head = build_head(self.cfg.model.head)

self.load_pretrain_model(model_path)

def load_pretrain_model(self, pretrain_model):

state_dict = torch.load(pretrain_model, map_location='cpu')['model']
new_state_dict = {}
for k, v in state_dict.items():
k = k.replace('module.', '')
new_state_dict[k] = v
self.load_state_dict(new_state_dict, strict=True)

def inference(self, x):

if self.training:
return self.forward_train(x)
else:
return self.forward_eval(x)

def forward_train(self, x):

pass

def forward_eval(self, x):

x = self.backbone(x)
x = self.neck(x)
prediction = self.head(x)

return prediction

def preprocess(self, image):
image = torch.from_numpy(image).type(torch.float32)
image = image.permute(2, 0, 1)
shape = image.shape # c, h, w
if self.size_divisible > 0:
import math
stride = self.size_divisible
shape = list(shape)
shape[1] = int(math.ceil(shape[1] / stride) * stride)
shape[2] = int(math.ceil(shape[2] / stride) * stride)
shape = tuple(shape)
pad_img = image.new(*shape).zero_()
pad_img[:, :image.shape[1], :image.shape[2]].copy_(image)
pad_img = pad_img.unsqueeze(0)

return pad_img

def postprocess(self, preds):
bboxes, scores, labels_idx = postprocess_gfocal(
preds, self.num_classes, self.conf_thre, self.nms_thre)
bboxes = bboxes.cpu().numpy()
scores = scores.cpu().numpy()
labels_idx = labels_idx.cpu().numpy()
labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx]

return (bboxes, scores, labels)
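

# A hypothetical end-to-end sketch (not part of the original file); the
# model_dir argument is assumed to contain 'airdet_s.py', the checkpoint and
# the pickled class map referenced by that config, exactly as __init__ expects.
def _single_stage_detector_sketch(model_dir, image_path):
    detector = SingleStageDetector(model_dir).eval()
    img = cv2.imread(image_path)  # HWC, BGR, uint8
    with torch.no_grad():
        batch = detector.preprocess(img)
        preds = detector.inference(batch)
    return detector.postprocess(preds)  # (bboxes, scores, labels)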


def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
iou_thr,
max_num=100,
score_factors=None):
"""NMS for multi-class bboxes.

Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
will not be considered.
        iou_thr (float): NMS IoU threshold.
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS.

    Returns:
        tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k,) and
            (k,). Labels are 0-based.
"""
num_classes = multi_scores.size(1)
# exclude background category
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
scores = multi_scores
# filter out boxes with low scores
valid_mask = scores > score_thr # 1000 * 80 bool

# We use masked_select for ONNX exporting purpose,
# which is equivalent to bboxes = bboxes[valid_mask]
# (TODO): as ONNX does not support repeat now,
# we have to use this ugly code
# bboxes -> 1000, 4
bboxes = torch.masked_select(
bboxes,
torch.stack((valid_mask, valid_mask, valid_mask, valid_mask),
-1)).view(-1, 4) # mask-> 1000*80*4, 80000*4
if score_factors is not None:
scores = scores * score_factors[:, None]
scores = torch.masked_select(scores, valid_mask)
labels = valid_mask.nonzero(as_tuple=False)[:, 1]

if bboxes.numel() == 0:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
scores = multi_bboxes.new_zeros((0, ))

return bboxes, scores, labels

keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr)

if max_num > 0:
keep = keep[:max_num]

return bboxes[keep], scores[keep], labels[keep]


def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7):
assert prediction.shape[0] == 1
for i, image_pred in enumerate(prediction):
# If none are remaining => process next image
if not image_pred.size(0):
continue
multi_bboxes = image_pred[:, :4]
multi_scores = image_pred[:, 4:]
detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores,
conf_thre, nms_thre, 500)

return detections, scores, labels

+ 16
- 0
modelscope/models/cv/tinynas_detection/head/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .gfocal_v2_tiny import GFocalHead_Tiny


def build_head(cfg):

head_cfg = copy.deepcopy(cfg)
name = head_cfg.pop('name')
if name == 'GFocalV2':
return GFocalHead_Tiny(**head_cfg)
else:
raise NotImplementedError

+ 361
- 0
modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py View File

@@ -0,0 +1,361 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import functools
from functools import partial

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..core.base_ops import BaseConv, DWConv


class Scale(nn.Module):

def __init__(self, scale=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))

def forward(self, x):
return x * self.scale


def multi_apply(func, *args, **kwargs):

pfunc = partial(func, **kwargs) if kwargs else func
map_results = map(pfunc, *args)
return tuple(map(list, zip(*map_results)))


def xyxy2CxCywh(xyxy, size=None):
x1 = xyxy[..., 0]
y1 = xyxy[..., 1]
x2 = xyxy[..., 2]
y2 = xyxy[..., 3]

cx = (x1 + x2) / 2
cy = (y1 + y2) / 2

w = x2 - x1
h = y2 - y1
if size is not None:
w = w.clamp(min=0, max=size[1])
h = h.clamp(min=0, max=size[0])
return torch.stack([cx, cy, w, h], axis=-1)


def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
"""
x1 = points[..., 0] - distance[..., 0]
y1 = points[..., 1] - distance[..., 1]
x2 = points[..., 0] + distance[..., 2]
y2 = points[..., 1] + distance[..., 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1])
y1 = y1.clamp(min=0, max=max_shape[0])
x2 = x2.clamp(min=0, max=max_shape[1])
y2 = y2.clamp(min=0, max=max_shape[0])
return torch.stack([x1, y1, x2, y2], -1)


def bbox2distance(points, bbox, max_dis=None, eps=0.1):
"""Decode bounding box based on distances.
"""
left = points[:, 0] - bbox[:, 0]
top = points[:, 1] - bbox[:, 1]
right = bbox[:, 2] - points[:, 0]
bottom = bbox[:, 3] - points[:, 1]
if max_dis is not None:
left = left.clamp(min=0, max=max_dis - eps)
top = top.clamp(min=0, max=max_dis - eps)
right = right.clamp(min=0, max=max_dis - eps)
bottom = bottom.clamp(min=0, max=max_dis - eps)
return torch.stack([left, top, right, bottom], -1)
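

# A small round-trip sketch (not part of the original file): encoding a box
# into per-side distances with bbox2distance and decoding with distance2bbox
# recovers the original corners exactly (no clamping applied).
def _distance_roundtrip_sketch():
    points = torch.tensor([[16., 16.]])
    boxes = torch.tensor([[4., 8., 28., 30.]])
    distances = bbox2distance(points, boxes)  # left, top, right, bottom
    return distance2bbox(points, distances)   # back to x1, y1, x2, y2 == boxes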


class Integral(nn.Module):
"""A fixed layer for calculating integral result from distribution.
"""

def __init__(self, reg_max=16):
super(Integral, self).__init__()
self.reg_max = reg_max
self.register_buffer('project',
torch.linspace(0, self.reg_max, self.reg_max + 1))

def forward(self, x):
"""Forward feature from the regression head to get integral result of
bounding box location.
"""
shape = x.size()
x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1)
b, nb, ne, _ = x.size()
x = x.reshape(b * nb * ne, self.reg_max + 1)
y = self.project.type_as(x).unsqueeze(1)
x = torch.matmul(x, y).reshape(b, nb, 4)
return x
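

# A minimal sketch (not part of the original file): all-zero logits give a
# uniform softmax over the reg_max + 1 bins, so the expected bin index is
# reg_max / 2 = 8.0 for every one of the four box sides.
def _integral_sketch():
    integral = Integral(reg_max=16)
    logits = torch.zeros(1, 3, 4 * 17)  # (batch, num_priors, 4 * (reg_max + 1))
    return integral(logits)             # shape (1, 3, 4), every entry 8.0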


class GFocalHead_Tiny(nn.Module):
"""Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality
Estimation for Dense Object Detection.
"""

def __init__(
self,
num_classes,
in_channels,
stacked_convs=4, # 4
feat_channels=256,
reg_max=12,
reg_topk=4,
reg_channels=64,
strides=[8, 16, 32],
add_mean=True,
norm='gn',
act='relu',
start_kernel_size=3,
conv_groups=1,
conv_type='BaseConv',
simOTA_cls_weight=1.0,
simOTA_iou_weight=3.0,
octbase=8,
simlqe=False,
**kwargs):
self.simlqe = simlqe
self.num_classes = num_classes
self.in_channels = in_channels
self.strides = strides
self.feat_channels = feat_channels if isinstance(feat_channels, list) \
else [feat_channels] * len(self.strides)

        self.cls_out_channels = num_classes + 1  # add 1 to keep consistency with former models;
        # this extra channel will be deprecated in the future.
self.stacked_convs = stacked_convs
self.conv_groups = conv_groups
self.reg_max = reg_max
self.reg_topk = reg_topk
self.reg_channels = reg_channels
self.add_mean = add_mean
self.total_dim = reg_topk
self.start_kernel_size = start_kernel_size

self.norm = norm
self.act = act
self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv

if add_mean:
self.total_dim += 1

super(GFocalHead_Tiny, self).__init__()
self.integral = Integral(self.reg_max)

self._init_layers()

def _build_not_shared_convs(self, in_channel, feat_channels):
self.relu = nn.ReLU(inplace=True)
cls_convs = nn.ModuleList()
reg_convs = nn.ModuleList()

for i in range(self.stacked_convs):
chn = feat_channels if i > 0 else in_channel
kernel_size = 3 if i > 0 else self.start_kernel_size
cls_convs.append(
self.conv_module(
chn,
feat_channels,
kernel_size,
stride=1,
groups=self.conv_groups,
norm=self.norm,
act=self.act))
reg_convs.append(
self.conv_module(
chn,
feat_channels,
kernel_size,
stride=1,
groups=self.conv_groups,
norm=self.norm,
act=self.act))
if not self.simlqe:
conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)]
else:
conf_vector = [
nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1)
]
conf_vector += [self.relu]
conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()]
reg_conf = nn.Sequential(*conf_vector)

return cls_convs, reg_convs, reg_conf

def _init_layers(self):
"""Initialize layers of the head."""
self.relu = nn.ReLU(inplace=True)
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
self.reg_confs = nn.ModuleList()

for i in range(len(self.strides)):
cls_convs, reg_convs, reg_conf = self._build_not_shared_convs(
self.in_channels[i], self.feat_channels[i])
self.cls_convs.append(cls_convs)
self.reg_convs.append(reg_convs)
self.reg_confs.append(reg_conf)

self.gfl_cls = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], self.cls_out_channels, 3, padding=1)
for i in range(len(self.strides))
])

self.gfl_reg = nn.ModuleList([
nn.Conv2d(
self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1)
for i in range(len(self.strides))
])

self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])

def forward(self,
xin,
labels=None,
imgs=None,
conf_thre=0.05,
nms_thre=0.7):

# prepare labels during training
b, c, h, w = xin[0].shape
if labels is not None:
gt_bbox_list = []
gt_cls_list = []
for label in labels:
gt_bbox_list.append(label.bbox)
gt_cls_list.append((label.get_field('labels')
- 1).long()) # labels starts from 1

# prepare priors for label assignment and bbox decode
mlvl_priors_list = [
self.get_single_level_center_priors(
xin[i].shape[0],
xin[i].shape[-2:],
stride,
dtype=torch.float32,
device=xin[0].device) for i, stride in enumerate(self.strides)
]
mlvl_priors = torch.cat(mlvl_priors_list, dim=1)

# forward for bboxes and classification prediction
cls_scores, bbox_preds = multi_apply(
self.forward_single,
xin,
self.cls_convs,
self.reg_convs,
self.gfl_cls,
self.gfl_reg,
self.reg_confs,
self.scales,
)
flatten_cls_scores = torch.cat(cls_scores, dim=1)
flatten_bbox_preds = torch.cat(bbox_preds, dim=1)

# calculating losses or bboxes decoded
if self.training:
loss = self.loss(flatten_cls_scores, flatten_bbox_preds,
gt_bbox_list, gt_cls_list, mlvl_priors)
return loss
else:
output = self.get_bboxes(flatten_cls_scores, flatten_bbox_preds,
mlvl_priors)
return output

def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg,
reg_conf, scale):
"""Forward feature of a single scale level.

"""
cls_feat = x
reg_feat = x

for cls_conv in cls_convs:
cls_feat = cls_conv(cls_feat)
for reg_conv in reg_convs:
reg_feat = reg_conv(reg_feat)

bbox_pred = scale(gfl_reg(reg_feat)).float()
N, C, H, W = bbox_pred.size()
prob = F.softmax(
bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2)
if not self.simlqe:
prob_topk, _ = prob.topk(self.reg_topk, dim=2)

if self.add_mean:
stat = torch.cat(
[prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2)
else:
stat = prob_topk

quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W))
else:
quality_score = reg_conf(
bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W))

cls_score = gfl_cls(cls_feat).sigmoid() * quality_score

flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2)
flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2)
return flatten_cls_score, flatten_bbox_pred

def get_single_level_center_priors(self, batch_size, featmap_size, stride,
dtype, device):

h, w = featmap_size
x_range = (torch.arange(0, int(w), dtype=dtype,
device=device)) * stride
y_range = (torch.arange(0, int(h), dtype=dtype,
device=device)) * stride

x = x_range.repeat(h, 1)
y = y_range.unsqueeze(-1).repeat(1, w)

y = y.flatten()
x = x.flatten()
strides = x.new_full((x.shape[0], ), stride)
priors = torch.stack([x, y, strides, strides], dim=-1)

return priors.unsqueeze(0).repeat(batch_size, 1, 1)

def sample(self, assign_result, gt_bboxes):
pos_inds = torch.nonzero(
assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique()
neg_inds = torch.nonzero(
assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique()
pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1

if gt_bboxes.numel() == 0:
# hack for index error case
assert pos_assigned_gt_inds.numel() == 0
pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
else:
if len(gt_bboxes.shape) < 2:
gt_bboxes = gt_bboxes.view(-1, 4)
pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]

return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds

def get_bboxes(self,
cls_preds,
reg_preds,
mlvl_center_priors,
img_meta=None):

dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None]
bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds)

res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1)

return res

+ 16
- 0
modelscope/models/cv/tinynas_detection/neck/__init__.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import copy

from .giraffe_fpn import GiraffeNeck
from .giraffe_fpn_v2 import GiraffeNeckV2


def build_neck(cfg):
neck_cfg = copy.deepcopy(cfg)
name = neck_cfg.pop('name')
if name == 'GiraffeNeck':
return GiraffeNeck(**neck_cfg)
    elif name == 'GiraffeNeckV2':
        return GiraffeNeckV2(**neck_cfg)
    else:
        raise NotImplementedError

+ 235
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_config.py View File

@@ -0,0 +1,235 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import collections
import itertools
import os

import networkx as nx
from omegaconf import OmegaConf

Node = collections.namedtuple('Node', ['id', 'inputs', 'type'])


def get_graph_info(graph):
input_nodes = []
output_nodes = []
Nodes = []
for node in range(graph.number_of_nodes()):
tmp = list(graph.neighbors(node))
tmp.sort()
type = -1
if node < tmp[0]:
input_nodes.append(node)
type = 0
if node > tmp[-1]:
output_nodes.append(node)
type = 1
Nodes.append(Node(node, [n for n in tmp if n < node], type))
return Nodes, input_nodes, output_nodes


def nodeid_trans(id, cur_level, num_levels):
if id % 2 == 1:
gap = int(((id + 1) // 2) * num_levels * 2)
else:
a = (num_levels - cur_level) * 2 - 1
b = ((id + 1) // 2) * num_levels * 2
gap = int(a + b)
return cur_level + gap


def gen_log2n_graph_file(log2n_graph_file, depth_multiplier):
f = open(log2n_graph_file, 'w')
for i in range(depth_multiplier):
for j in [1, 2, 4, 8, 16, 32]:
if i - j < 0:
break
else:
f.write('%d,%d\n' % (i - j, i))
f.close()


def get_log2n_graph(depth_multiplier):
nodes = []
connnections = []

for i in range(depth_multiplier):
nodes.append(i)
for j in [1, 2, 4, 8, 16, 32]:
if i - j < 0:
break
else:
connnections.append((i - j, i))
return nodes, connnections


def get_dense_graph(depth_multiplier):
nodes = []
connections = []

for i in range(depth_multiplier):
nodes.append(i)
for j in range(i):
connections.append((j, i))
return nodes, connections


def giraffeneck_config(min_level,
max_level,
weight_method=None,
depth_multiplier=5,
with_backslash=False,
with_slash=False,
with_skip_connect=False,
skip_connect_type='dense'):
"""Graph config with log2n merge and panet"""
if skip_connect_type == 'dense':
nodes, connections = get_dense_graph(depth_multiplier)
elif skip_connect_type == 'log2n':
nodes, connections = get_log2n_graph(depth_multiplier)
graph = nx.Graph()
graph.add_nodes_from(nodes)
graph.add_edges_from(connections)

drop_node = []
nodes, input_nodes, output_nodes = get_graph_info(graph)

weight_method = weight_method or 'fastattn'

num_levels = max_level - min_level + 1
node_ids = {min_level + i: [i] for i in range(num_levels)}
node_ids_per_layer = {}

pnodes = {}

def update_drop_node(new_id, input_offsets):
if new_id not in drop_node:
new_id = new_id
else:
while new_id in drop_node:
if new_id in pnodes:
for n in pnodes[new_id]['inputs_offsets']:
if n not in input_offsets and n not in drop_node:
input_offsets.append(n)
new_id = new_id - 1
if new_id not in input_offsets:
input_offsets.append(new_id)

# top-down layer
for i in range(max_level, min_level - 1, -1):
node_ids_per_layer[i] = []
for id, node in enumerate(nodes):
input_offsets = []
if id in input_nodes:
input_offsets.append(node_ids[i][0])
else:
if with_skip_connect:
for input_id in node.inputs:
new_id = nodeid_trans(input_id, i - min_level,
num_levels)
update_drop_node(new_id, input_offsets)

# add top2down
new_id = nodeid_trans(id, i - min_level, num_levels)

# add backslash node
def cal_backslash_node(id):
ind = id // num_levels
mod = id % num_levels
if ind % 2 == 0: # even
if mod == (num_levels - 1):
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod - 1)
else: # odd
if mod == 0:
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod + 1)

return last

# add slash node
def cal_slash_node(id):
ind = id // num_levels
mod = id % num_levels
if ind % 2 == 1: # odd
if mod == (num_levels - 1):
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod - 1)
else: # even
if mod == 0:
last = -1
else:
last = (ind - 1) * num_levels + (
num_levels - 1 - mod + 1)

return last

# add last node
last = new_id - 1
update_drop_node(last, input_offsets)

if with_backslash:
backslash = cal_backslash_node(new_id)
if backslash != -1 and backslash not in input_offsets:
input_offsets.append(backslash)

if with_slash:
slash = cal_slash_node(new_id)
if slash != -1 and slash not in input_offsets:
input_offsets.append(slash)

if new_id in drop_node:
input_offsets = []

pnodes[new_id] = {
'reduction': 1 << i,
'inputs_offsets': input_offsets,
'weight_method': weight_method,
'is_out': 0,
}

input_offsets = []
for out_id in output_nodes:
new_id = nodeid_trans(out_id, i - min_level, num_levels)
input_offsets.append(new_id)

pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = {
'reduction': 1 << i,
'inputs_offsets': input_offsets,
'weight_method': weight_method,
'is_out': 1,
}

pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0]))
return pnodes


def get_graph_config(fpn_name,
min_level=3,
max_level=7,
weight_method='concat',
depth_multiplier=5,
with_backslash=False,
with_slash=False,
with_skip_connect=False,
skip_connect_type='dense'):
name_to_config = {
'giraffeneck':
giraffeneck_config(
min_level=min_level,
max_level=max_level,
weight_method=weight_method,
depth_multiplier=depth_multiplier,
with_backslash=with_backslash,
with_slash=with_slash,
with_skip_connect=with_skip_connect,
skip_connect_type=skip_connect_type),
}
return name_to_config[fpn_name]

+ 661
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py View File

@@ -0,0 +1,661 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import logging
import math
from collections import OrderedDict
from functools import partial
from typing import Callable, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm import create_model
from timm.models.layers import (Swish, create_conv2d, create_pool2d,
get_act_layer)

from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer
from .giraffe_config import get_graph_config

_ACT_LAYER = Swish


class SequentialList(nn.Sequential):
""" This module exists to work around torchscript typing issues list -> list"""

def __init__(self, *args):
super(SequentialList, self).__init__(*args)

def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
for module in self:
x = module(x)
return x


class ConvBnAct2d(nn.Module):

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
dilation=1,
padding='',
bias=False,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER):
super(ConvBnAct2d, self).__init__()

self.conv = create_conv2d(
in_channels,
out_channels,
kernel_size,
stride=stride,
dilation=dilation,
padding=padding,
bias=bias)
self.bn = None if norm_layer is None else norm_layer(out_channels)
self.act = None if act_layer is None else act_layer(inplace=True)

def forward(self, x):
x = self.conv(x)
if self.bn is not None:
x = self.bn(x)
if self.act is not None:
x = self.act(x)
return x


class SeparableConv2d(nn.Module):
""" Separable Conv
"""

def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
dilation=1,
padding='',
bias=False,
channel_multiplier=1.0,
pw_kernel_size=1,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER):
super(SeparableConv2d, self).__init__()
self.conv_dw = create_conv2d(
in_channels,
int(in_channels * channel_multiplier),
kernel_size,
stride=stride,
dilation=dilation,
padding=padding,
depthwise=True)

self.conv_pw = create_conv2d(
int(in_channels * channel_multiplier),
out_channels,
pw_kernel_size,
padding=padding,
bias=bias)

self.bn = None if norm_layer is None else norm_layer(out_channels)
self.act = None if act_layer is None else act_layer(inplace=True)

def forward(self, x):
x = self.conv_dw(x)
x = self.conv_pw(x)
if self.bn is not None:
x = self.bn(x)
if self.act is not None:
x = self.act(x)
return x


def _init_weight(
m,
n='',
):
""" Weight initialization as per Tensorflow official implementations.
"""

def _fan_in_out(w, groups=1):
dimensions = w.dim()
if dimensions < 2:
raise ValueError(
'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions'
)
num_input_fmaps = w.size(1)
num_output_fmaps = w.size(0)
receptive_field_size = 1
if w.dim() > 2:
receptive_field_size = w[0][0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
fan_out //= groups
return fan_in, fan_out

def _glorot_uniform(w, gain=1, groups=1):
fan_in, fan_out = _fan_in_out(w, groups)
gain /= max(1., (fan_in + fan_out) / 2.) # fan avg
limit = math.sqrt(3.0 * gain)
w.data.uniform_(-limit, limit)

def _variance_scaling(w, gain=1, groups=1):
fan_in, fan_out = _fan_in_out(w, groups)
gain /= max(1., fan_in) # fan in
std = math.sqrt(gain)
w.data.normal_(std=std)

if isinstance(m, SeparableConv2d):
if 'box_net' in n or 'class_net' in n:
_variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups)
_variance_scaling(m.conv_pw.weight)
if m.conv_pw.bias is not None:
if 'class_net.predict' in n:
m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.conv_pw.bias.data.zero_()
else:
_glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups)
_glorot_uniform(m.conv_pw.weight)
if m.conv_pw.bias is not None:
m.conv_pw.bias.data.zero_()
elif isinstance(m, ConvBnAct2d):
if 'box_net' in n or 'class_net' in n:
m.conv.weight.data.normal_(std=.01)
if m.conv.bias is not None:
if 'class_net.predict' in n:
m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.conv.bias.data.zero_()
else:
_glorot_uniform(m.conv.weight)
if m.conv.bias is not None:
m.conv.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1.0)
m.bias.data.zero_()


def _init_weight_alt(
m,
n='',
):
""" Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition
NOTE: this will likely be removed after some experimentation
"""
if isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
if 'class_net.predict' in n:
m.bias.data.fill_(-math.log((1 - 0.01) / 0.01))
else:
m.bias.data.zero_()
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1.0)
m.bias.data.zero_()


class Interpolate2d(nn.Module):
r"""Resamples a 2d Image

The input data is assumed to be of the form
`minibatch x channels x [optional depth] x [optional height] x width`.
Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.

The algorithms available for upsampling are nearest neighbor and linear,
bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor,
respectively.

One can either give a :attr:`scale_factor` or the target output :attr:`size` to
calculate the output size. (You cannot give both, as it is ambiguous)

Args:
size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional):
output spatial sizes
scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional):
multiplier for spatial size. Has to match input size if it is a tuple.
mode (str, optional): the upsampling algorithm: one of ``'nearest'``,
``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``.
Default: ``'nearest'``
align_corners (bool, optional): if ``True``, the corner pixels of the input
and output tensors are aligned, and thus preserving the values at
those pixels. This only has effect when :attr:`mode` is
``'linear'``, ``'bilinear'``, or ``'trilinear'``. Default: ``False``
"""
__constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name']
name: str
size: Optional[Union[int, Tuple[int, int]]]
scale_factor: Optional[Union[float, Tuple[float, float]]]
mode: str
align_corners: Optional[bool]

def __init__(self,
size: Optional[Union[int, Tuple[int, int]]] = None,
scale_factor: Optional[Union[float, Tuple[float,
float]]] = None,
mode: str = 'nearest',
align_corners: bool = False) -> None:
super(Interpolate2d, self).__init__()
self.name = type(self).__name__
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
self.mode = mode
self.align_corners = None if mode == 'nearest' else align_corners

def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.interpolate(
input,
self.size,
self.scale_factor,
self.mode,
self.align_corners,
recompute_scale_factor=False)
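

# A minimal sketch (not part of the original file): Interpolate2d forwards to
# F.interpolate with recompute_scale_factor=False; nearest-neighbour
# upsampling by a factor of 2 doubles both spatial dimensions.
def _interpolate2d_sketch():
    up = Interpolate2d(scale_factor=2.0, mode='nearest')
    return up(torch.randn(1, 8, 16, 16)).shape  # torch.Size([1, 8, 32, 32])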


class ResampleFeatureMap(nn.Sequential):

def __init__(self,
in_channels,
out_channels,
reduction_ratio=1.,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
apply_bn=False,
conv_after_downsample=False,
redundant_bias=False):
super(ResampleFeatureMap, self).__init__()
downsample = downsample or 'max'
upsample = upsample or 'nearest'
self.in_channels = in_channels
self.out_channels = out_channels
self.reduction_ratio = reduction_ratio
self.conv_after_downsample = conv_after_downsample

conv = None
if in_channels != out_channels:
conv = ConvBnAct2d(
in_channels,
out_channels,
kernel_size=1,
padding=pad_type,
norm_layer=norm_layer if apply_bn else None,
bias=not apply_bn or redundant_bias,
act_layer=None)

if reduction_ratio > 1:
if conv is not None and not self.conv_after_downsample:
self.add_module('conv', conv)
if downsample in ('max', 'avg'):
stride_size = int(reduction_ratio)
downsample = create_pool2d(
downsample,
kernel_size=stride_size + 1,
stride=stride_size,
padding=pad_type)
else:
downsample = Interpolate2d(
scale_factor=1. / reduction_ratio, mode=downsample)
self.add_module('downsample', downsample)
if conv is not None and self.conv_after_downsample:
self.add_module('conv', conv)
else:
if conv is not None:
self.add_module('conv', conv)
if reduction_ratio < 1:
scale = int(1 // reduction_ratio)
self.add_module(
'upsample',
Interpolate2d(scale_factor=scale, mode=upsample))


class GiraffeCombine(nn.Module):

def __init__(self,
feature_info,
fpn_config,
fpn_channels,
inputs_offsets,
target_reduction,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
apply_resample_bn=False,
conv_after_downsample=False,
redundant_bias=False,
weight_method='attn'):
super(GiraffeCombine, self).__init__()
self.inputs_offsets = inputs_offsets
self.weight_method = weight_method

self.resample = nn.ModuleDict()
reduction_base = feature_info[0]['reduction']

target_channels_idx = int(
math.log(target_reduction // reduction_base, 2))
for idx, offset in enumerate(inputs_offsets):
if offset < len(feature_info):
in_channels = feature_info[offset]['num_chs']
input_reduction = feature_info[offset]['reduction']
else:
node_idx = offset
input_reduction = fpn_config[node_idx]['reduction']
# in_channels = fpn_config[node_idx]['num_chs']
input_channels_idx = int(
math.log(input_reduction // reduction_base, 2))
in_channels = feature_info[input_channels_idx]['num_chs']

reduction_ratio = target_reduction / input_reduction
if weight_method == 'concat':
self.resample[str(offset)] = ResampleFeatureMap(
in_channels,
in_channels,
reduction_ratio=reduction_ratio,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias)
else:
self.resample[str(offset)] = ResampleFeatureMap(
in_channels,
fpn_channels[target_channels_idx],
reduction_ratio=reduction_ratio,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias)

if weight_method == 'attn' or weight_method == 'fastattn':
self.edge_weights = nn.Parameter(
torch.ones(len(inputs_offsets)), requires_grad=True) # WSM
else:
self.edge_weights = None

def forward(self, x: List[torch.Tensor]):
dtype = x[0].dtype
nodes = []
if len(self.inputs_offsets) == 0:
return None
for offset, resample in zip(self.inputs_offsets,
self.resample.values()):
input_node = x[offset]
input_node = resample(input_node)
nodes.append(input_node)

if self.weight_method == 'attn':
normalized_weights = torch.softmax(
self.edge_weights.to(dtype=dtype), dim=0)
out = torch.stack(nodes, dim=-1) * normalized_weights
out = torch.sum(out, dim=-1)
elif self.weight_method == 'fastattn':
edge_weights = nn.functional.relu(
self.edge_weights.to(dtype=dtype))
weights_sum = torch.sum(edge_weights)
weights_norm = weights_sum + 0.0001
out = torch.stack([(nodes[i] * edge_weights[i]) / weights_norm
for i in range(len(nodes))],
dim=-1)

out = torch.sum(out, dim=-1)
elif self.weight_method == 'sum':
out = torch.stack(nodes, dim=-1)
out = torch.sum(out, dim=-1)
elif self.weight_method == 'concat':
out = torch.cat(nodes, dim=1)
else:
raise ValueError('unknown weight_method {}'.format(
self.weight_method))
return out


class GiraffeNode(nn.Module):
""" A simple wrapper used in place of nn.Sequential for torchscript typing
Handles input type List[Tensor] -> output type Tensor
"""

def __init__(self, combine: nn.Module, after_combine: nn.Module):
super(GiraffeNode, self).__init__()
self.combine = combine
self.after_combine = after_combine

def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
combine_feat = self.combine(x)
if combine_feat is None:
return None
else:
return self.after_combine(combine_feat)


class GiraffeLayer(nn.Module):

def __init__(self,
feature_info,
fpn_config,
inner_fpn_channels,
outer_fpn_channels,
num_levels=5,
pad_type='',
downsample=None,
upsample=None,
norm_layer=nn.BatchNorm2d,
act_layer=_ACT_LAYER,
apply_resample_bn=False,
conv_after_downsample=True,
conv_bn_relu_pattern=False,
separable_conv=True,
redundant_bias=False,
merge_type='conv'):
super(GiraffeLayer, self).__init__()
self.num_levels = num_levels
self.conv_bn_relu_pattern = False

self.feature_info = {}
for idx, feat in enumerate(feature_info):
self.feature_info[idx] = feat

self.fnode = nn.ModuleList()
reduction_base = feature_info[0]['reduction']
for i, fnode_cfg in fpn_config.items():
logging.debug('fnode {} : {}'.format(i, fnode_cfg))

if fnode_cfg['is_out'] == 1:
fpn_channels = outer_fpn_channels
else:
fpn_channels = inner_fpn_channels

reduction = fnode_cfg['reduction']
fpn_channels_idx = int(math.log(reduction // reduction_base, 2))
combine = GiraffeCombine(
self.feature_info,
fpn_config,
fpn_channels,
tuple(fnode_cfg['inputs_offsets']),
target_reduction=reduction,
pad_type=pad_type,
downsample=downsample,
upsample=upsample,
norm_layer=norm_layer,
apply_resample_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias,
weight_method=fnode_cfg['weight_method'])

after_combine = nn.Sequential()

in_channels = 0
out_channels = 0
for input_offset in fnode_cfg['inputs_offsets']:
in_channels += self.feature_info[input_offset]['num_chs']

out_channels = fpn_channels[fpn_channels_idx]

if merge_type == 'csp':
after_combine.add_module(
'CspLayer',
CSPLayer(
in_channels,
out_channels,
2,
shortcut=True,
depthwise=False,
act='silu'))
elif merge_type == 'shuffle':
after_combine.add_module(
'shuffleBlock', ShuffleBlock(in_channels, in_channels))
after_combine.add_module(
'conv1x1',
create_conv2d(in_channels, out_channels, kernel_size=1))
elif merge_type == 'conv':
after_combine.add_module(
'conv1x1',
create_conv2d(in_channels, out_channels, kernel_size=1))
conv_kwargs = dict(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=pad_type,
bias=False,
norm_layer=norm_layer,
act_layer=act_layer)
if not conv_bn_relu_pattern:
conv_kwargs['bias'] = redundant_bias
conv_kwargs['act_layer'] = None
after_combine.add_module('act', act_layer(inplace=True))
after_combine.add_module(
'conv',
SeparableConv2d(**conv_kwargs)
if separable_conv else ConvBnAct2d(**conv_kwargs))

self.fnode.append(
GiraffeNode(combine=combine, after_combine=after_combine))
self.feature_info[i] = dict(
num_chs=fpn_channels[fpn_channels_idx], reduction=reduction)

self.out_feature_info = []
out_node = list(self.feature_info.keys())[-num_levels::]
for i in out_node:
self.out_feature_info.append(self.feature_info[i])

self.feature_info = self.out_feature_info

def forward(self, x: List[torch.Tensor]):
for fn in self.fnode:
x.append(fn(x))
return x[-self.num_levels::]


class GiraffeNeck(nn.Module):

def __init__(self, min_level, max_level, num_levels, norm_layer,
norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels,
out_fpn_channels, weight_method, depth_multiplier,
width_multiplier, with_backslash, with_slash,
with_skip_connect, skip_connect_type, separable_conv,
feature_info, merge_type, pad_type, downsample_type,
upsample_type, apply_resample_bn, conv_after_downsample,
redundant_bias, conv_bn_relu_pattern, alternate_init):
super(GiraffeNeck, self).__init__()

self.num_levels = num_levels
self.min_level = min_level
self.in_features = [0, 1, 2, 3, 4, 5,
6][self.min_level - 1:self.min_level - 1
+ num_levels]
self.alternate_init = alternate_init
norm_layer = norm_layer or nn.BatchNorm2d
if norm_kwargs:
norm_layer = partial(norm_layer, **norm_kwargs)
act_layer = get_act_layer(act_type) or _ACT_LAYER
fpn_config = fpn_config or get_graph_config(
fpn_name,
min_level=min_level,
max_level=max_level,
weight_method=weight_method,
depth_multiplier=depth_multiplier,
with_backslash=with_backslash,
with_slash=with_slash,
with_skip_connect=with_skip_connect,
skip_connect_type=skip_connect_type)

# width scale
for i in range(len(fpn_channels)):
fpn_channels[i] = int(fpn_channels[i] * width_multiplier)

self.resample = nn.ModuleDict()
for level in range(num_levels):
if level < len(feature_info):
in_chs = feature_info[level]['num_chs']
reduction = feature_info[level]['reduction']
else:
# Adds a coarser level by downsampling the last feature map
reduction_ratio = 2
self.resample[str(level)] = ResampleFeatureMap(
in_channels=in_chs,
out_channels=feature_info[level - 1]['num_chs'],
pad_type=pad_type,
downsample=downsample_type,
upsample=upsample_type,
norm_layer=norm_layer,
reduction_ratio=reduction_ratio,
apply_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
redundant_bias=redundant_bias,
)
in_chs = feature_info[level - 1]['num_chs']
reduction = int(reduction * reduction_ratio)
feature_info.append(dict(num_chs=in_chs, reduction=reduction))

self.cell = SequentialList()
logging.debug('building giraffeNeck')
giraffe_layer = GiraffeLayer(
feature_info=feature_info,
fpn_config=fpn_config,
inner_fpn_channels=fpn_channels,
outer_fpn_channels=out_fpn_channels,
num_levels=num_levels,
pad_type=pad_type,
downsample=downsample_type,
upsample=upsample_type,
norm_layer=norm_layer,
act_layer=act_layer,
separable_conv=separable_conv,
apply_resample_bn=apply_resample_bn,
conv_after_downsample=conv_after_downsample,
conv_bn_relu_pattern=conv_bn_relu_pattern,
redundant_bias=redundant_bias,
merge_type=merge_type)
self.cell.add_module('giraffeNeck', giraffe_layer)
feature_info = giraffe_layer.feature_info

def init_weights(self, pretrained=False):
for n, m in self.named_modules():
if 'backbone' not in n:
if self.alternate_init:
_init_weight_alt(m, n)
else:
_init_weight(m, n)

def forward(self, x: List[torch.Tensor]):
if type(x) is tuple:
x = list(x)
x = [x[f] for f in self.in_features]
for resample in self.resample.values():
x.append(resample(x[-1]))
x = self.cell(x)
return x
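
Not part of the diff above, but the resampling decisions made by ResampleFeatureMap boil down to pooling when moving to a coarser level and interpolating when moving to a finer one. A minimal standalone sketch of that choice (plain torch, made-up tensors and ratios; it does not use the actual class):

import torch
import torch.nn.functional as F

def resample_sketch(x: torch.Tensor, reduction_ratio: float) -> torch.Tensor:
    # Illustrative stand-in: pool to go coarser, interpolate to go finer,
    # pass through when the level already matches.
    if reduction_ratio > 1:
        stride = int(reduction_ratio)
        return F.max_pool2d(x, kernel_size=stride + 1, stride=stride, padding=stride // 2)
    if reduction_ratio < 1:
        return F.interpolate(x, scale_factor=int(1 / reduction_ratio), mode='nearest')
    return x

feat = torch.randn(1, 64, 40, 40)      # dummy feature map, stride 16
coarser = resample_sketch(feat, 2.0)   # -> 20x20, i.e. stride 32
finer = resample_sketch(feat, 0.5)     # -> 80x80, i.e. stride 8
print(coarser.shape, finer.shape)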

+ 203
- 0
modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py View File

@@ -0,0 +1,203 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import torch
import torch.nn as nn

from ..core.base_ops import BaseConv, CSPLayer, DWConv
from ..core.neck_ops import CSPStage


class GiraffeNeckV2(nn.Module):

def __init__(
self,
depth=1.0,
width=1.0,
in_features=[2, 3, 4],
in_channels=[256, 512, 1024],
out_channels=[256, 512, 1024],
depthwise=False,
act='silu',
spp=True,
reparam_mode=True,
block_name='BasicBlock',
):
super().__init__()
self.in_features = in_features
self.in_channels = in_channels
Conv = DWConv if depthwise else BaseConv

reparam_mode = reparam_mode

self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

# node x3: input x0, x1
self.bu_conv13 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_3 = CSPStage(
block_name,
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_3 = CSPLayer(
int((in_channels[1] + in_channels[2]) * width),
int(in_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x4: input x1, x2, x3
self.bu_conv24 = Conv(
int(in_channels[0] * width),
int(in_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_4 = CSPStage(
block_name,
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_4 = CSPLayer(
int((in_channels[0] + in_channels[1] + in_channels[2])
* width),
int(in_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x5: input x2, x4
if reparam_mode:
self.merge_5 = CSPStage(
block_name,
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_5 = CSPLayer(
int((in_channels[1] + in_channels[0]) * width),
int(out_channels[0] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x7: input x4, x5
self.bu_conv57 = Conv(
int(out_channels[0] * width),
int(out_channels[0] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_7 = CSPStage(
block_name,
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_7 = CSPLayer(
int((out_channels[0] + in_channels[1]) * width),
int(out_channels[1] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

# node x6: input x3, x4, x7
self.bu_conv46 = Conv(
int(in_channels[1] * width),
int(in_channels[1] * width),
3,
2,
act=act)
self.bu_conv76 = Conv(
int(out_channels[1] * width),
int(out_channels[1] * width),
3,
2,
act=act)
if reparam_mode:
self.merge_6 = CSPStage(
block_name,
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
act=act,
spp=spp)
else:
self.merge_6 = CSPLayer(
int((in_channels[1] + out_channels[1] + in_channels[2])
* width),
int(out_channels[2] * width),
round(3 * depth),
False,
depthwise=depthwise,
act=act)

def init_weights(self):
pass

def forward(self, out_features):
"""
Args:
inputs: input images.

Returns:
Tuple[Tensor]: FPN feature.
"""

# backbone
features = [out_features[f] for f in self.in_features]
[x2, x1, x0] = features

# node x3
x13 = self.bu_conv13(x1)
x3 = torch.cat([x0, x13], 1)
x3 = self.merge_3(x3)

# node x4
x34 = self.upsample(x3)
x24 = self.bu_conv24(x2)
x4 = torch.cat([x1, x24, x34], 1)
x4 = self.merge_4(x4)

# node x5
x45 = self.upsample(x4)
x5 = torch.cat([x2, x45], 1)
x5 = self.merge_5(x5)

# node x7
x57 = self.bu_conv57(x5)
x7 = torch.cat([x4, x57], 1)
x7 = self.merge_7(x7)

# node x6
x46 = self.bu_conv46(x4)
x76 = self.bu_conv76(x7)
x6 = torch.cat([x3, x46, x76], 1)
x6 = self.merge_6(x6)

outputs = (x5, x7, x6)
return outputs
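
A quick smoke test of the topology above, not part of the diff. This is only a sketch: it assumes the package layout follows the file path added in this PR and that the supporting ops (BaseConv, CSPStage) ship with the same PR; the dummy shapes correspond to strides 8/16/32 on a 640x640 input.

import torch

# Assumed import path, mirroring modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py
from modelscope.models.cv.tinynas_detection.neck.giraffe_fpn_v2 import GiraffeNeckV2

neck = GiraffeNeckV2(depth=1.0, width=1.0,
                     in_features=[2, 3, 4],
                     in_channels=[256, 512, 1024],
                     out_channels=[256, 512, 1024])

# Indices 0 and 1 are never read because in_features=[2, 3, 4].
dummy = [None, None,
         torch.randn(1, 256, 80, 80),    # stride 8
         torch.randn(1, 512, 40, 40),    # stride 16
         torch.randn(1, 1024, 20, 20)]   # stride 32

x5, x7, x6 = neck(dummy)
print(x5.shape, x7.shape, x6.shape)
# Expected with these settings: (1, 256, 80, 80), (1, 512, 40, 40), (1, 1024, 20, 20)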

+ 16
- 0
modelscope/models/cv/tinynas_detection/tinynas_detector.py View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks
from .detector import SingleStageDetector


@MODELS.register_module(
Tasks.image_object_detection, module_name=Models.tinynas_detection)
class TinynasDetector(SingleStageDetector):

def __init__(self, model_dir, *args, **kwargs):

super(TinynasDetector, self).__init__(model_dir, *args, **kwargs)
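
The register_module call above is what lets the generic pipeline factory resolve this detector by task name. A hedged usage sketch (the model id and image path below are placeholders, not real hub entries; any model card used must declare Models.tinynas_detection in its configuration):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Placeholder model id for illustration only.
detector = pipeline(Tasks.image_object_detection,
                    model='damo/xxx_tinynas_detection')
result = detector('path/to/local_image.jpg')  # placeholder image path
print(result)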

+ 30
- 0
modelscope/models/cv/tinynas_detection/utils.py View File

@@ -0,0 +1,30 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet.

import importlib
import os
import sys
from os.path import dirname, join


def get_config_by_file(config_file):
try:
sys.path.append(os.path.dirname(config_file))
current_config = importlib.import_module(
os.path.basename(config_file).split('.')[0])
exp = current_config.Config()
except Exception:
        raise ImportError(
            "{} doesn't contain a class named 'Config'".format(config_file))
return exp


def parse_config(config_file):
"""
get config object by file.
Args:
config_file (str): file path of config.
"""
assert (config_file is not None), 'plz provide config file'
if config_file is not None:
return get_config_by_file(config_file)
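
parse_config and get_config_by_file expect the config file itself to be a Python module exposing a Config class. A small self-contained sketch of that contract, not part of the diff (the temporary file and its num_classes attribute are invented for illustration; the import path is assumed from this PR's layout):

import os
import tempfile

from modelscope.models.cv.tinynas_detection.utils import parse_config  # assumed path

cfg_source = '''
class Config:
    def __init__(self):
        self.num_classes = 80   # illustrative field only
'''

with tempfile.TemporaryDirectory() as tmp_dir:
    cfg_path = os.path.join(tmp_dir, 'airdet_demo_cfg.py')
    with open(cfg_path, 'w') as f:
        f.write(cfg_source)
    cfg = parse_config(cfg_path)
    print(cfg.num_classes)  # -> 80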

+ 6
- 4
modelscope/models/multi_modal/mplug/modeling_mplug.py View File

@@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel):
ModelFile.TORCH_MODEL_BIN_FILE)
checkpoint = torch.load(checkpoint_path, map_location='cpu')
if 'model' in checkpoint:
state_dict = checkpoint['model']
else:
state_dict = checkpoint['module']
checkpoint = checkpoint['model']
checkpoint = {
k.replace('model.', ''): v
for k, v in checkpoint.items()
}

msg = model.load_state_dict(state_dict, strict=False)
msg = model.load_state_dict(checkpoint, strict=False)
print('load checkpoint from %s' % checkpoint_path)
print(msg)
return model
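
The change above normalises checkpoint keys by stripping a 'model.' prefix before load_state_dict. The renaming itself is plain dict manipulation; a tiny standalone illustration with toy keys standing in for the real MPlug state dict:

checkpoint = {
    'model.encoder.weight': 1,   # toy values in place of tensors
    'model.decoder.weight': 2,
    'decoder.bias': 3,
}
renamed = {k.replace('model.', ''): v for k, v in checkpoint.items()}
print(sorted(renamed))  # ['decoder.bias', 'decoder.weight', 'encoder.weight']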


+ 10
- 6
modelscope/models/nlp/__init__.py View File

@@ -9,12 +9,15 @@ if TYPE_CHECKING:
from .bert_for_sequence_classification import BertForSequenceClassification
from .bert_for_document_segmentation import BertForDocumentSegmentation
from .csanmt_for_translation import CsanmtForTranslation
from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
BertForMaskedLM)
from .masked_language import (
StructBertForMaskedLM,
VecoForMaskedLM,
BertForMaskedLM,
DebertaV2ForMaskedLM,
)
from .nncrf_for_named_entity_recognition import (
TransformerCRFForNamedEntityRecognition,
LSTMCRFForNamedEntityRecognition)
from .palm_v2 import PalmForTextGeneration
from .token_classification import SbertForTokenClassification
from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification
from .space import SpaceForDialogIntent
@@ -22,7 +25,6 @@ if TYPE_CHECKING:
from .space import SpaceForDialogStateTracking
from .star_text_to_sql import StarForTextToSql
from .task_models import (InformationExtractionModel,
SequenceClassificationModel,
SingleBackboneTaskModelBase)
from .bart_for_text_error_correction import BartForTextErrorCorrection
from .gpt3 import GPT3ForTextGeneration
@@ -36,8 +38,10 @@ else:
'csanmt_for_translation': ['CsanmtForTranslation'],
'bert_for_sequence_classification': ['BertForSequenceClassification'],
'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
'masked_language':
['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'],
'masked_language': [
'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
'DebertaV2ForMaskedLM'
],
'nncrf_for_named_entity_recognition': [
'TransformerCRFForNamedEntityRecognition',
'LSTMCRFForNamedEntityRecognition'


+ 73
- 0
modelscope/models/nlp/deberta_v2/__init__.py View File

@@ -0,0 +1,73 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

_import_structure = {
'configuration_deberta_v2': [
'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config',
'DebertaV2OnnxConfig'
],
'tokenization_deberta_v2': ['DebertaV2Tokenizer'],
}

if TYPE_CHECKING:
from .configuration_deberta_v2 import DebertaV2Config
from .tokenization_deberta_v2 import DebertaV2Tokenizer
from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast

from .modeling_deberta_v2 import (
DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
DebertaV2ForMaskedLM,
DebertaV2ForMultipleChoice,
DebertaV2ForQuestionAnswering,
DebertaV2ForSequenceClassification,
DebertaV2ForTokenClassification,
DebertaV2Model,
DebertaV2PreTrainedModel,
)

else:
_import_structure = {
'configuration_deberta_v2':
['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'],
'tokenization_deberta_v2': ['DebertaV2Tokenizer']
}
_import_structure['tokenization_deberta_v2_fast'] = [
'DebertaV2TokenizerFast'
]
_import_structure['modeling_deberta_v2'] = [
'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST',
'DebertaV2ForMaskedLM',
'DebertaV2ForMultipleChoice',
'DebertaV2ForQuestionAnswering',
'DebertaV2ForSequenceClassification',
'DebertaV2ForTokenClassification',
'DebertaV2Model',
'DebertaV2PreTrainedModel',
]
import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__)

+ 130
- 0
modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py View File

@@ -0,0 +1,130 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020, Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

from transformers import PretrainedConfig

from modelscope.utils import logger as logging

logger = logging.get_logger(__name__)


class DebertaV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a
DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the DeBERTa
[microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Arguments:
vocab_size (`int`, *optional*, defaults to 128100):
Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`DebertaV2Model`].
hidden_size (`int`, *optional*, defaults to 1536):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 24):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 6144):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"`
are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 0):
The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
The epsilon used by the layer normalization layers.
        relative_attention (`bool`, *optional*, defaults to `False`):
            Whether to use relative position encoding.
max_relative_positions (`int`, *optional*, defaults to -1):
The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value
as `max_position_embeddings`.
pad_token_id (`int`, *optional*, defaults to 0):
The value used to pad input_ids.
        position_biased_input (`bool`, *optional*, defaults to `True`):
            Whether to add absolute position embeddings to the content embeddings.
        pos_att_type (`List[str]`, *optional*):
            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
            `["p2c", "c2p"]`.
"""
model_type = 'deberta_v2'

def __init__(self,
vocab_size=128100,
hidden_size=1536,
num_hidden_layers=24,
num_attention_heads=24,
intermediate_size=6144,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=0,
initializer_range=0.02,
layer_norm_eps=1e-7,
relative_attention=False,
max_relative_positions=-1,
pad_token_id=0,
position_biased_input=True,
pos_att_type=None,
pooler_dropout=0,
pooler_hidden_act='gelu',
**kwargs):
super().__init__(**kwargs)

self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.relative_attention = relative_attention
self.max_relative_positions = max_relative_positions
self.pad_token_id = pad_token_id
self.position_biased_input = position_biased_input

# Backwards compatibility
if type(pos_att_type) == str:
pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')]

self.pos_att_type = pos_att_type
self.vocab_size = vocab_size
self.layer_norm_eps = layer_norm_eps

self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size)
self.pooler_dropout = pooler_dropout
self.pooler_hidden_act = pooler_hidden_act
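
A small sketch of the config class in isolation (not part of the diff), including the backwards-compatibility branch that accepts pos_att_type as a '|'-separated string; the import path is assumed to follow the lazy-import module added in this PR:

from modelscope.models.nlp.deberta_v2 import DebertaV2Config  # assumed path

cfg = DebertaV2Config(
    vocab_size=128100,
    hidden_size=768,            # smaller than the 1536 default, just for illustration
    num_hidden_layers=12,
    num_attention_heads=12,
    relative_attention=True,
    pos_att_type='p2c|c2p',     # string form is normalised to ['p2c', 'c2p']
)
print(cfg.pos_att_type)         # -> ['p2c', 'c2p']
print(cfg.layer_norm_eps)       # -> 1e-07 (default)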

+ 1789
- 0
modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py
File diff suppressed because it is too large
View File


+ 546
- 0
modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py View File

@@ -0,0 +1,546 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`"""

import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as sp
from transformers.tokenization_utils import PreTrainedTokenizer

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}

PRETRAINED_INIT_CONFIGURATION = {}

VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'}


class DebertaV2Tokenizer(PreTrainedTokenizer):
r"""
Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
and [jieba](https://github.com/fxsjy/jieba).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
bos_token (`string`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
eos_token (`string`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
used for the end of sequence. The token used is the `sep_token`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

def __init__(self,
vocab_file,
do_lower_case=False,
split_by_punct=False,
split_chinese=True,
bos_token='[CLS]',
eos_token='[SEP]',
unk_token='[UNK]',
sep_token='[SEP]',
pad_token='[PAD]',
cls_token='[CLS]',
mask_token='[MASK]',
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
split_by_punct=split_by_punct,
split_chinese=split_chinese,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs,
)

if not os.path.isfile(vocab_file):
raise ValueError(
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`'
)
self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.split_chinese = split_chinese
self.vocab_file = vocab_file
self._tokenizer = SPMTokenizer(
vocab_file,
split_by_punct=split_by_punct,
sp_model_kwargs=self.sp_model_kwargs)
self.jieba = None
if self.split_chinese:
try:
import jieba
except ImportError:
raise ImportError(
                    'You need to install jieba to split Chinese text and use DebertaV2Tokenizer. '
'See https://pypi.org/project/jieba/ for installation.')
self.jieba = jieba

@property
def vocab_size(self):
return len(self.vocab)

@property
def vocab(self):
return self._tokenizer.vocab

def get_vocab(self):
vocab = self.vocab.copy()
vocab.update(self.get_added_vocab())
return vocab

def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
if self.do_lower_case:
text = text.lower()
if self.split_chinese:
seg_list = [x for x in self.jieba.cut(text)]
text = ' '.join(seg_list)
return self._tokenizer.tokenize(text)

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self._tokenizer.spm.PieceToId(token)

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self._tokenizer.spm.IdToPiece(
index) if index < self.vocab_size else self.unk_token

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
return self._tokenizer.decode(tokens)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A DeBERTa sequence has the following format:

- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""

if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep

def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.

Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""

if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)

if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + (
[0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]

def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+ sep) * [1]

def prepare_for_tokenization(self,
text,
is_split_into_words=False,
**kwargs):
add_prefix_space = kwargs.pop('add_prefix_space', False)
if is_split_into_words or add_prefix_space:
text = ' ' + text
return (text, kwargs)

def save_vocabulary(self,
save_directory: str,
filename_prefix: Optional[str] = None) -> Tuple[str]:
return self._tokenizer.save_pretrained(
save_directory, filename_prefix=filename_prefix)


class SPMTokenizer:
r"""
Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

def __init__(self,
vocab_file,
split_by_punct=False,
sp_model_kwargs: Optional[Dict[str, Any]] = None):
self.split_by_punct = split_by_punct
self.vocab_file = vocab_file
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
if not os.path.exists(vocab_file):
raise FileNotFoundError(f'{vocab_file} does not exist!')
spm.load(vocab_file)
bpe_vocab_size = spm.GetPieceSize()
# Token map
# <unk> 0+1
# <s> 1+1
# </s> 2+1
self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
# self.vocab['[PAD]'] = 0
# self.vocab['[CLS]'] = 1
# self.vocab['[SEP]'] = 2
# self.vocab['[UNK]'] = 3

        self.spm = spm
        # Special tokens registered via `add_special_token`; consulted by
        # `part_of_whole_word` below.
        self.special_tokens = []

def __getstate__(self):
state = self.__dict__.copy()
state['spm'] = None
return state

def __setstate__(self, d):
self.__dict__ = d

# for backward compatibility
if not hasattr(self, 'sp_model_kwargs'):
self.sp_model_kwargs = {}

self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
self.spm.Load(self.vocab_file)

def tokenize(self, text):
return self._encode_as_pieces(text)

def convert_ids_to_tokens(self, ids):
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens

def decode(self, tokens, start=-1, end=-1, raw_text=None):
if raw_text is None:
return self.spm.decode_pieces([t for t in tokens])
else:
words = self.split_to_words(raw_text)
word_tokens = [self.tokenize(w) for w in words]
token2words = [0] * len(tokens)
tid = 0
for i, w in enumerate(word_tokens):
for k, t in enumerate(w):
token2words[tid] = i
tid += 1
word_start = token2words[start]
word_end = token2words[end] if end < len(tokens) else len(words)
text = ''.join(words[word_start:word_end])
return text

def add_special_token(self, token):
if token not in self.special_tokens:
self.special_tokens.append(token)
if token not in self.vocab:
self.vocab[token] = len(self.vocab) - 1
self.ids_to_tokens.append(token)
return self.id(token)

def part_of_whole_word(self, token, is_bos=False):
if is_bos:
return True
if (len(token) == 1 and (_is_whitespace(list(token)[0]))):
return False
if _is_control(list(token)[0]):
return False
if _is_punctuation(list(token)[0]):
return False
        if token in self.special_tokens:
            return False

word_start = b'\xe2\x96\x81'.decode('utf-8')
return not token.startswith(word_start)

def pad(self):
return '[PAD]'

def bos(self):
return '[CLS]'

def eos(self):
return '[SEP]'

def unk(self):
return '[UNK]'

def mask(self):
return '[MASK]'

def sym(self, id):
return self.ids_to_tokens[id]

def id(self, sym):
return self.vocab[sym] if sym in self.vocab else 1

def _encode_as_pieces(self, text):
text = convert_to_unicode(text)
if self.split_by_punct:
words = self._run_split_on_punc(text)
pieces = [self.spm.encode(w, out_type=str) for w in words]
return [p for w in pieces for p in w]
else:
return self.spm.encode(text, out_type=str)

def split_to_words(self, text):
pieces = self._encode_as_pieces(text)
word_start = b'\xe2\x96\x81'.decode('utf-8')
words = []
offset = 0
prev_end = 0
for i, p in enumerate(pieces):
if p.startswith(word_start):
if offset > prev_end:
words.append(text[prev_end:offset])
prev_end = offset
w = p.replace(word_start, '')
else:
w = p
try:
s = text.index(w, offset)
pn = ''
k = i + 1
while k < len(pieces):
pn = pieces[k].replace(word_start, '')
if len(pn) > 0:
break
k += 1

if len(pn) > 0 and pn in text[offset:s]:
offset = offset + 1
else:
offset = s + len(w)
except Exception:
offset = offset + 1

if prev_end < offset:
words.append(text[prev_end:offset])

return words

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def save_pretrained(self, path: str, filename_prefix: str = None):
filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
if filename_prefix is not None:
filename = filename_prefix + '-' + filename
full_path = os.path.join(path, filename)
with open(full_path, 'wb') as fs:
fs.write(self.spm.serialized_model_proto())
return (full_path, )


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat.startswith('C'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError(f'Unsupported string type: {type(text)}')
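
The special-token layout documented in build_inputs_with_special_tokens and create_token_type_ids_from_sequences can be checked without a SentencePiece model at hand. The sketch below (not part of the diff) re-derives the two formulas on toy ids purely to illustrate the documented [CLS]/[SEP] format; it does not touch the tokenizer class itself.

# Toy ids standing in for real vocabulary entries.
CLS, SEP = 101, 102
seq_a = [11, 12, 13]
seq_b = [21, 22]

# Single sequence: [CLS] X [SEP]
single = [CLS] + seq_a + [SEP]
# Pair of sequences: [CLS] A [SEP] B [SEP]
pair = [CLS] + seq_a + [SEP] + seq_b + [SEP]

# Token type ids: 0 for '[CLS] A [SEP]', 1 for 'B [SEP]'
token_type_ids = [0] * (len(seq_a) + 2) + [1] * (len(seq_b) + 1)

# Special tokens mask for the pair: 1 at [CLS] and at both [SEP] positions
special_mask = [1] + [0] * len(seq_a) + [1] + [0] * len(seq_b) + [1]

assert len(pair) == len(token_type_ids) == len(special_mask)
print(pair, token_type_ids, special_mask, sep='\n')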

+ 241
- 0
modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py View File

@@ -0,0 +1,241 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for model DeBERTa."""

import os
from shutil import copyfile
from typing import Optional, Tuple

from transformers.file_utils import is_sentencepiece_available
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

from modelscope.utils import logger as logging

if is_sentencepiece_available():
from .tokenization_deberta_v2 import DebertaV2Tokenizer
else:
DebertaV2Tokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
'vocab_file': 'spm.model',
'tokenizer_file': 'tokenizer.json'
}

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}

PRETRAINED_INIT_CONFIGURATION = {}


class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
r"""
Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece)
and [rjieba-py](https://github.com/messense/rjieba-py).

Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (`bool`, *optional*, defaults to `False`):
Whether or not to lowercase the input when tokenizing.
bos_token (`string`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
eos_token (`string`, *optional*, defaults to `"[SEP]"`):
The end of sequence token. When building a sequence using special tokens, this is not the token that is
used for the end of sequence. The token used is the `sep_token`.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:

- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

- `nbest_size = {0,1}`: No sampling is performed.
- `nbest_size > 1`: samples from the nbest_size results.
- `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = DebertaV2Tokenizer

def __init__(self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=False,
split_by_punct=False,
split_chinese=True,
bos_token='[CLS]',
eos_token='[SEP]',
unk_token='[UNK]',
sep_token='[SEP]',
pad_token='[PAD]',
cls_token='[CLS]',
mask_token='[MASK]',
**kwargs) -> None:
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
split_by_punct=split_by_punct,
split_chinese=split_chinese,
**kwargs,
)

self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct
self.split_chinese = split_chinese
self.vocab_file = vocab_file
        self.can_save_slow_tokenizer = bool(self.vocab_file)

def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A DeBERTa sequence has the following format:

- single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP]

Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""

if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep

def get_special_tokens_mask(self,
token_ids_0,
token_ids_1=None,
already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.

Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""

if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)

if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + (
[0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]

def create_token_type_ids_from_sequences(self,
token_ids_0,
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```

If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.

Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1
+ sep) * [1]

def save_vocabulary(self,
save_directory: str,
filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
'tokenizer.')

if not os.path.isdir(save_directory):
logger.error(
f'Vocabulary path ({save_directory}) should be a directory')
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + '-' if filename_prefix else '')
+ VOCAB_FILES_NAMES['vocab_file'])

if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)

return (out_vocab_file, )

+ 4
- 0
modelscope/models/nlp/gpt3/modeling_gpt3.py View File

@@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel):
state_dict_file = os.path.join(pretrained_model_name_or_path,
ModelFile.TORCH_MODEL_BIN_FILE)
state_dict = torch.load(state_dict_file)
state_dict = {
k.replace('model.language_model', 'language_model'): v
for k, v in state_dict.items()
}
model.load_state_dict(state_dict)
return model

+ 39
- 0
modelscope/models/nlp/masked_language.py View File

@@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.deberta_v2 import \
DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
from modelscope.models.nlp.structbert import SbertForMaskedLM
from modelscope.models.nlp.veco import \
VecoForMaskedLM as VecoForMaskedLMTransformer
@@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer):
VecoForMaskedLM).from_pretrained(
pretrained_model_name_or_path=model_dir,
model_dir=model_dir)


@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
"""Deberta v2 for MLM model.

Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets.
"""

def __init__(self, config, model_dir):
super(TorchModel, self).__init__(model_dir)
DebertaV2ForMaskedLMTransformer.__init__(self, config)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
labels=None):
output = DebertaV2ForMaskedLMTransformer.forward(
self,
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
labels=labels)
output[OutputKeys.INPUT_IDS] = input_ids
return output

@classmethod
def _instantiate(cls, **kwargs):
model_dir = kwargs.get('model_dir')
return super(DebertaV2ForMaskedLMTransformer,
DebertaV2ForMaskedLM).from_pretrained(
pretrained_model_name_or_path=model_dir,
model_dir=model_dir)

+ 8
- 8
modelscope/models/nlp/palm_v2/modeling_palm.py View File

@@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel): # Model
self.generator.dense.weight = self.decoder.embeddings.weight

if checkpoint is not None:
for key in list(checkpoint['model'].keys()):
checkpoint['model'][key.replace('module.',
'')] = checkpoint['model'][key]
msg = self.load_state_dict(checkpoint['model'], strict=False)
print(msg)
if 'model' in checkpoint:
checkpoint = checkpoint['model']
for key in list(checkpoint.keys()):
checkpoint[key.replace('model.palm.', '')] = checkpoint[key]
self.load_state_dict(checkpoint, strict=False)
else:
for module in self.decoder.modules():
if isinstance(module, (nn.Linear, nn.Embedding)):
@@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel):
return addict.Dict(loss=loss)


class Translator(nn.Module):
class Translator(object):
"""
Uses a model to translate a batch of sentences.
"""
@@ -1298,8 +1298,8 @@ class Translator(nn.Module):

return results

def forward(self, input_ids: torch.Tensor,
attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
def __call__(self, input_ids: torch.Tensor,
attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]:
batch = self.Batch(
batch_size=input_ids.size()[0],
src=input_ids,


+ 20
- 0
modelscope/msdatasets/cv/face_2d_keypoins/__init__.py View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .face_2d_keypoints_dataset import FaceKeypointDataset

else:
_import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

+ 13
- 0
modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py View File

@@ -0,0 +1,13 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset

from modelscope.metainfo import Datasets
from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
from modelscope.utils.constant import Tasks


@TASK_DATASETS.register_module(
group_key=Tasks.face_2d_keypoints,
module_name=Datasets.Face2dKeypointsDataset)
class FaceKeypointDataset(_FaceKeypointDataset):
"""EasyCV dataset for face 2d keypoints."""

+ 3
- 9
modelscope/msdatasets/ms_dataset.py View File

@@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset):
for idx in range(iter_start, iter_end):
item_dict = self.dataset[idx]
res = {
k: np.array(item_dict[k])
k: torch.tensor(item_dict[k])
for k in self.columns if k in self.retained_columns
}
for preprocessor in self.preprocessor_list:
res.update({
k: np.array(v)
k: torch.tensor(v)
for k, v in preprocessor(item_dict).items()
if k in self.retained_columns
})
@@ -574,14 +574,8 @@ class MsDataset:
None

"""
from modelscope.hub.api import HubApi
_hub_api = HubApi()
cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
_upload_manager = DatasetUploadManager(
dataset_name=dataset_name,
namespace=namespace,
version=version,
cookies=cookies)
dataset_name=dataset_name, namespace=namespace, version=version)
_upload_manager.upload(object_name, local_file_path)

@staticmethod


+ 18
- 6
modelscope/msdatasets/utils/oss_utils.py View File

@@ -18,6 +18,12 @@ class OssUtilities:
self.oss_dir = oss_config['Dir']
self.oss_backup_dir = oss_config['BackupDir']

self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset'
self.upload_multipart_threshold = 50 * 1024 * 1024
self.upload_part_size = 1 * 1024 * 1024
self.upload_num_threads = 4
self.upload_max_retries = 3

@staticmethod
def _percentage(consumed_bytes, total_bytes):
if total_bytes:
@@ -42,21 +48,27 @@ class OssUtilities:
progress_callback=self._percentage)
return local_path

def upload(self, oss_file_name: str, local_file_path: str) -> str:
max_retries = 3
def upload(self, oss_object_name: str, local_file_path: str) -> str:
retry_count = 0
object_key = os.path.join(self.oss_dir, oss_file_name)
object_key = os.path.join(self.oss_dir, oss_object_name)
resumable_store = oss2.ResumableStore(
root=self.upload_resumable_tmp_store)

while True:
try:
retry_count += 1
self.bucket.put_object_from_file(
oss2.resumable_upload(
self.bucket,
object_key,
local_file_path,
progress_callback=self._percentage)
store=resumable_store,
multipart_threshold=self.upload_multipart_threshold,
part_size=self.upload_part_size,
progress_callback=self._percentage,
num_threads=self.upload_num_threads)
break
except Exception:
if retry_count >= max_retries:
if retry_count >= self.upload_max_retries:
raise

return object_key

+ 10
- 12
modelscope/msdatasets/utils/upload_utils.py View File

@@ -1,23 +1,21 @@
from http.cookiejar import CookieJar

from .oss_utils import OssUtilities


class DatasetUploadManager(object):

def __init__(self, dataset_name: str, namespace: str, version: str,
cookies: CookieJar):
def __init__(self, dataset_name: str, namespace: str, version: str):
from modelscope.hub.api import HubApi
api = HubApi()
oss_config = api.get_dataset_access_config_session(
cookies=cookies,
_hub_api = HubApi()
_cookies = _hub_api.check_cookies_upload_data(use_cookies=True)
_oss_config = _hub_api.get_dataset_access_config_session(
cookies=_cookies,
dataset_name=dataset_name,
namespace=namespace,
revision=version)

self.oss_utilities = OssUtilities(oss_config)
self.oss_utilities = OssUtilities(_oss_config)

def upload(self, oss_file_name: str, local_file_path: str) -> str:
oss_object_key = self.oss_utilities.upload(
oss_file_name=oss_file_name, local_file_path=local_file_path)
return oss_object_key
def upload(self, object_name: str, local_file_path: str) -> str:
object_key = self.oss_utilities.upload(
oss_object_name=object_name, local_file_path=local_file_path)
return object_key
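
A minimal usage sketch of the simplified `DatasetUploadManager`: cookies are now fetched inside the constructor, so callers only pass the dataset coordinates. All names and paths below are placeholders.

```python
from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager

manager = DatasetUploadManager(
    dataset_name='my_dataset',   # placeholder dataset name
    namespace='my_namespace',    # placeholder namespace
    version='v1.0.0')            # placeholder version

# Returns the OSS object key the file was uploaded to.
object_key = manager.upload(
    object_name='train/images.zip',
    local_file_path='/path/to/images.zip')
print(object_key)
```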

+ 61
- 1
modelscope/outputs.py View File

@@ -35,6 +35,7 @@ class OutputKeys(object):
UUID = 'uuid'
WORD = 'word'
KWS_LIST = 'kws_list'
TIMESTAMPS = 'timestamps'
SPLIT_VIDEO_NUM = 'split_video_num'
SPLIT_META_DICT = 'split_meta_dict'

@@ -56,6 +57,15 @@ TASK_OUTPUTS = {
# }
Tasks.ocr_recognition: [OutputKeys.TEXT],

# face 2d keypoint result for single sample
# {
# "keypoints": [
# [x1, y1]*106
# ],
# "poses": [pitch, roll, yaw]
# }
Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],

# face detection result for single sample
# {
# "scores": [0.9, 0.1, 0.05, 0.05]
@@ -75,6 +85,14 @@ TASK_OUTPUTS = {
Tasks.face_detection:
[OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],

# facial expression recognition result for single sample
# {
# "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
# "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
# }
Tasks.facial_expression_recognition:
[OutputKeys.SCORES, OutputKeys.LABELS],

# face recognition result for single sample
# {
# "img_embedding": np.array with shape [1, D],
@@ -201,6 +219,21 @@ TASK_OUTPUTS = {
# }
Tasks.body_3d_keypoints: [OutputKeys.POSES],

# 2D hand keypoints result for single sample
# {
# "keypoints": [
# [[x, y, score] * 21],
# [[x, y, score] * 21],
# [[x, y, score] * 21],
# ],
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ]
# }
Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],

# video single object tracking result for single video
# {
# "boxes": [
@@ -242,7 +275,20 @@ TASK_OUTPUTS = {
# "output_img": np.ndarray with shape [height, width, 3]
# }
Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],

# text driven segmentation result for single sample
# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.text_driven_segmentation: [OutputKeys.MASKS],
# shop segmentation result for single sample
# {
# "masks": [
# np.array # 2D array containing only 0, 255
# ]
# }
Tasks.shop_segmentation: [OutputKeys.MASKS],
# movie scene segmentation result for a single video
# {
# "split_video_num":3,
@@ -541,6 +587,19 @@ TASK_OUTPUTS = {
# }
Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],

# {
# 'labels': ['吸烟', '打电话', '吸烟'],  # i.e. ['smoking', 'making a phone call', 'smoking']
# 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
# 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
# 'timestamps': [1, 3, 5]
# }
Tasks.action_detection: [
OutputKeys.TIMESTAMPS,
OutputKeys.LABELS,
OutputKeys.SCORES,
OutputKeys.BOXES,
],

# {
# 'output': [
# [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
@@ -551,6 +610,7 @@ TASK_OUTPUTS = {
# {'label': '13421097', 'score': 2.75914817393641e-06}]]
# }
Tasks.faq_question_answering: [OutputKeys.OUTPUT],

# image person reid result for single sample
# {
# "img_embedding": np.array with shape [1, D],


+ 0
- 1
modelscope/pipelines/base.py View File

@@ -2,7 +2,6 @@

import os.path as osp
from abc import ABC, abstractmethod
from contextlib import contextmanager
from threading import Lock
from typing import Any, Dict, Generator, List, Mapping, Union



+ 16
- 1
modelscope/pipelines/builder.py View File

@@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
Tasks.action_recognition: (Pipelines.action_recognition,
'damo/cv_TAdaConv_action-recognition'),
Tasks.action_detection: (Pipelines.action_detection,
'damo/cv_ResNetC3D_action-detection_detection2d'),
Tasks.live_category: (Pipelines.live_category,
'damo/cv_resnet50_live-category'),
Tasks.video_category: (Pipelines.video_category,
@@ -97,10 +99,18 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_hrnetv2w32_body-2d-keypoints_image'),
Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints,
'damo/cv_canonical_body-3d-keypoints_video'),
Tasks.hand_2d_keypoints:
(Pipelines.hand_2d_keypoints,
'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'),
Tasks.face_detection: (Pipelines.face_detection,
'damo/cv_resnet_facedetection_scrfd10gkps'),
Tasks.face_recognition: (Pipelines.face_recognition,
'damo/cv_ir101_facerecognition_cfglint'),
Tasks.facial_expression_recognition:
(Pipelines.facial_expression_recognition,
'damo/cv_vgg19_facial-expression-recognition_fer'),
Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints,
'damo/cv_mobilenet_face-2d-keypoints_alignment'),
Tasks.video_multi_modal_embedding:
(Pipelines.video_multi_modal_embedding,
'damo/multi_modal_clip_vtretrival_msrvtt_53'),
@@ -147,9 +157,14 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_vitb_video-single-object-tracking_ostrack'),
Tasks.image_reid_person: (Pipelines.image_reid_person,
'damo/cv_passvitb_image-reid-person_market'),
Tasks.text_driven_segmentation:
(Pipelines.text_driven_segmentation,
'damo/cv_vitl16_segmentation_text-driven-seg'),
Tasks.movie_scene_segmentation:
(Pipelines.movie_scene_segmentation,
'damo/cv_resnet50-bert_video-scene-segmentation_movienet')
'damo/cv_resnet50-bert_video-scene-segmentation_movienet'),
Tasks.shop_segmentation: (Pipelines.shop_segmentation,
'damo/cv_vitb16_segmentation_shop-seg'),
}
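
With the defaults registered above, the `model` argument becomes optional; a small sketch of task-only construction, which downloads the listed default model from the hub.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Each call resolves to the default model id registered in the table above.
shop_seg = pipeline(Tasks.shop_segmentation)        # damo/cv_vitb16_segmentation_shop-seg
hand_keypoints = pipeline(Tasks.hand_2d_keypoints)  # damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody
action_det = pipeline(Tasks.action_detection)       # damo/cv_ResNetC3D_action-detection_detection2d
```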




+ 19
- 3
modelscope/pipelines/cv/__init__.py View File

@@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .action_recognition_pipeline import ActionRecognitionPipeline
from .action_detection_pipeline import ActionDetectionPipeline
from .animal_recognition_pipeline import AnimalRecognitionPipeline
from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline
from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline
from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline
from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline
from .crowd_counting_pipeline import CrowdCountingPipeline
@@ -42,15 +44,21 @@ if TYPE_CHECKING:
from .tinynas_classification_pipeline import TinynasClassificationPipeline
from .video_category_pipeline import VideoCategoryPipeline
from .virtual_try_on_pipeline import VirtualTryonPipeline
from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline
from .shop_segmentation_pipleline import ShopSegmentationPipeline
from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline
from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline
from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline
from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline
from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline

else:
_import_structure = {
'action_recognition_pipeline': ['ActionRecognitionPipeline'],
'action_detection_pipeline': ['ActionDetectionPipeline'],
'animal_recognition_pipeline': ['AnimalRecognitionPipeline'],
'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'],
'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'],
'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'],
'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'],
'crowd_counting_pipeline': ['CrowdCountingPipeline'],
@@ -93,10 +101,18 @@ else:
'tinynas_classification_pipeline': ['TinynasClassificationPipeline'],
'video_category_pipeline': ['VideoCategoryPipeline'],
'virtual_try_on_pipeline': ['VirtualTryonPipeline'],
'easycv_pipeline':
['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'],
'shop_segmentation_pipleline': ['ShopSegmentationPipeline'],
'easycv_pipeline': [
'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline',
'Face2DKeypointsPipeline'
],
'text_driven_segmentation_pipleline':
['TextDrivenSegmentationPipeline'],
'movie_scene_segmentation_pipeline':
['MovieSceneSegmentationPipeline'],
'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'],
'facial_expression_recognition_pipeline':
['FacialExpressionRecognitionPipeline']
}

import sys


+ 63
- 0
modelscope/pipelines/cv/action_detection_pipeline.py View File

@@ -0,0 +1,63 @@
import math
import os.path as osp
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.models.cv.action_detection import ActionDetONNX
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.action_detection, module_name=Pipelines.action_detection)
class ActionDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create an action detection pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE)
logger.info(f'loading model from {model_path}')
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading config from {config_path}')
self.cfg = Config.from_file(config_path)
self.cfg.MODEL.model_file = model_path
self.model = ActionDetONNX(self.model, self.cfg.MODEL,
self.device_name)
logger.info('load model done')

def preprocess(self, input: Input) -> Dict[str, Any]:
if isinstance(input, str):
video_name = input
else:
raise TypeError(f'input should be a str,'
f' but got {type(input)}')
result = {'video_name': video_name}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
preds = self.model.forward(input['video_name'])
labels = sum([pred['actions']['labels'] for pred in preds], [])
scores = sum([pred['actions']['scores'] for pred in preds], [])
boxes = sum([pred['actions']['boxes'] for pred in preds], [])
timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels'])
for pred in preds], [])
out = {
OutputKeys.TIMESTAMPS: timestamps,
OutputKeys.LABELS: labels,
OutputKeys.SCORES: scores,
OutputKeys.BOXES: boxes
}
return out

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
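
A minimal usage sketch of the pipeline above; the model id comes from the builder.py default table in this diff, while 'test.mp4' is a placeholder video path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

action_detect = pipeline(
    Tasks.action_detection,
    model='damo/cv_ResNetC3D_action-detection_detection2d')

# preprocess() expects a video file path (str); 'test.mp4' is a placeholder.
result = action_detect('test.mp4')
print(result[OutputKeys.TIMESTAMPS])
print(result[OutputKeys.LABELS], result[OutputKeys.SCORES], result[OutputKeys.BOXES])
```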

+ 3
- 1
modelscope/pipelines/cv/easycv_pipelines/__init__.py View File

@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .detection_pipeline import EasyCVDetectionPipeline
from .segmentation_pipeline import EasyCVSegmentationPipeline
from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline
else:
_import_structure = {
'detection_pipeline': ['EasyCVDetectionPipeline'],
'segmentation_pipeline': ['EasyCVSegmentationPipeline']
'segmentation_pipeline': ['EasyCVSegmentationPipeline'],
'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline']
}

import sys


+ 41
- 0
modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py View File

@@ -0,0 +1,41 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from .base import EasyCVPipeline


@PIPELINES.register_module(
Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints)
class Face2DKeypointsPipeline(EasyCVPipeline):
"""Pipeline for face 2d keypoints detection."""

def __init__(self,
model: str,
model_file_pattern=ModelFile.TORCH_MODEL_FILE,
*args,
**kwargs):
"""
model (str): model id on modelscope hub or local model path.
model_file_pattern (str): model file pattern.
"""

super(Face2DKeypointsPipeline, self).__init__(
model=model,
model_file_pattern=model_file_pattern,
*args,
**kwargs)

def show_result(self, img, points, scale=2, save_path=None):
return self.predict_op.show_result(img, points, scale, save_path)

def __call__(self, inputs) -> Any:
output = self.predict_op(inputs)[0][0]
points = output['point']
poses = output['pose']

return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses}
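
A minimal usage sketch, assuming the default model id registered in builder.py; 'face.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_keypoints = pipeline(
    Tasks.face_2d_keypoints,
    model='damo/cv_mobilenet_face-2d-keypoints_alignment')

# Returns 106 facial keypoints plus pitch/roll/yaw pose angles (see outputs.py).
result = face_keypoints('face.jpg')  # placeholder image path
print(result[OutputKeys.KEYPOINTS], result[OutputKeys.POSES])
```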

+ 128
- 0
modelscope/pipelines/cv/facial_expression_recognition_pipeline.py View File

@@ -0,0 +1,128 @@
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.face_recognition.align_face import align_face
from modelscope.models.cv.facial_expression_recognition import \
FacialExpressionRecognition
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.facial_expression_recognition,
module_name=Pipelines.facial_expression_recognition)
class FacialExpressionRecognitionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a facial expression recognition pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {ckpt_path}')
device = torch.device(
f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
fer = FacialExpressionRecognition(model_path=ckpt_path, device=device)
self.fer = fer
self.device = device
logger.info('load model done')

# face detect pipeline
det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
self.face_detection = pipeline(
Tasks.face_detection, model=det_model_id)

def _choose_face(self,
det_result,
img=None,
min_face=10,
top_face=1,
center_face=False):
'''
Choose the face with the maximum area.
Args:
det_result: output of the face detection pipeline
img: original image; needed to pick the most centered face when center_face is True
min_face: minimum valid face width/height
top_face: keep the faces with the largest areas
center_face: choose the most centered face among multiple faces, only effective if top_face > 1
'''
bboxes = np.array(det_result[OutputKeys.BOXES])
landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
if bboxes.shape[0] == 0:
logger.info('Warning: No face detected!')
return None
# face idx with enough size
face_idx = []
for i in range(bboxes.shape[0]):
box = bboxes[i]
if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face:
face_idx += [i]
if len(face_idx) == 0:
logger.info(
f'Warning: Face size not enough, less than {min_face}x{min_face}!'
)
return None
bboxes = bboxes[face_idx]
landmarks = landmarks[face_idx]
# find max faces
boxes = np.array(bboxes)
area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
sort_idx = np.argsort(area)[-top_face:]
# find center face
if top_face > 1 and center_face and bboxes.shape[0] > 1:
img_center = [img.shape[1] // 2, img.shape[0] // 2]
min_dist = float('inf')
sel_idx = -1
for _idx in sort_idx:
box = boxes[_idx]
dist = np.square(
np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square(
np.abs((box[1] + box[3]) / 2 - img_center[1]))
if dist < min_dist:
min_dist = dist
sel_idx = _idx
sort_idx = [sel_idx]
main_idx = sort_idx[-1]
return bboxes[main_idx], landmarks[main_idx]

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img = img[:, :, ::-1]
det_result = self.face_detection(img.copy())
rtn = self._choose_face(det_result, img=img)
face_img = None
if rtn is not None:
_, face_lmks = rtn
face_lmks = face_lmks.reshape(5, 2)
face_img, _ = align_face(img, (112, 112), face_lmks)
face_img = face_img.astype(np.float32)
result = {}
result['img'] = face_img
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.fer(input)
assert result is not None
scores = result[0].tolist()
labels = result[1].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.LABELS: labels,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
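
A minimal usage sketch of the pipeline above, using the default model id from builder.py; 'face.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

fer = pipeline(
    Tasks.facial_expression_recognition,
    model='damo/cv_vgg19_facial-expression-recognition_fer')

# Scores are aligned with the expression labels described in outputs.py.
result = fer('face.jpg')  # placeholder image path
for label, score in zip(result[OutputKeys.LABELS], result[OutputKeys.SCORES]):
    print(label, score)
```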

+ 51
- 0
modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py View File

@@ -0,0 +1,51 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path

from modelscope.metainfo import Pipelines
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from .easycv_pipelines.base import EasyCVPipeline


@PIPELINES.register_module(
Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints)
class Hand2DKeypointsPipeline(EasyCVPipeline):
"""Pipeline for hand pose keypoint task."""

def __init__(self,
model: str,
model_file_pattern=ModelFile.TORCH_MODEL_FILE,
*args,
**kwargs):
"""
model (str): model id on modelscope hub or local model path.
model_file_pattern (str): model file pattern.
"""
self.model_dir = model
super(Hand2DKeypointsPipeline, self).__init__(
model=model,
model_file_pattern=model_file_pattern,
*args,
**kwargs)

def _build_predict_op(self):
"""Build EasyCV predictor."""
from easycv.predictors.builder import build_predictor
detection_predictor_type = self.cfg['DETECTION']['type']
detection_model_path = os.path.join(
self.model_dir, self.cfg['DETECTION']['model_path'])
detection_cfg_file = os.path.join(self.model_dir,
self.cfg['DETECTION']['config_file'])
detection_score_threshold = self.cfg['DETECTION']['score_threshold']
self.cfg.pipeline.predictor_config[
'detection_predictor_config'] = dict(
type=detection_predictor_type,
model_path=detection_model_path,
config_file=detection_cfg_file,
score_threshold=detection_score_threshold)
easycv_config = self._to_easycv_config()
pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, {
'model_path': self.model_path,
'config_file': easycv_config
})
return pipeline_op
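
A minimal usage sketch, assuming the default model id registered in builder.py; 'hands.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

hand_keypoints = pipeline(
    Tasks.hand_2d_keypoints,
    model='damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody')

# One box and 21 [x, y, score] keypoints per detected hand (see outputs.py).
result = hand_keypoints('hands.jpg')  # placeholder image path
print(result[OutputKeys.BOXES], result[OutputKeys.KEYPOINTS])
```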

+ 2
- 0
modelscope/pipelines/cv/ocr_detection_pipeline.py View File

@@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline):
def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
rboxes = inputs['combined_rboxes'][0]
count = inputs['combined_counts'][0]
if count == 0 or count < rboxes.shape[0]:
raise Exception('modelscope error: No text detected')
rboxes = rboxes[:count, :]

# convert rboxes to polygons and find its coordinates on the original image
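
With the check above, an image that yields no text regions now raises instead of returning empty results; a caller-side sketch, where 'blank_page.jpg' is a placeholder path and the default ocr_detection model is resolved by builder.py.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

ocr_detect = pipeline(Tasks.ocr_detection)

try:
    result = ocr_detect('blank_page.jpg')  # placeholder image path
except Exception as err:
    # Raised by postprocess() when no text boxes survive the count check.
    print(f'no text detected: {err}')
```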


+ 58
- 0
modelscope/pipelines/cv/retina_face_detection_pipeline.py View File

@@ -0,0 +1,58 @@
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.face_detection import RetinaFaceDetection
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.face_detection, module_name=Pipelines.retina_face_detection)
class RetinaFaceDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a face detection pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {ckpt_path}')
detector = RetinaFaceDetection(
model_path=ckpt_path, device=self.device)
self.detector = detector
logger.info('load model done')

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img = img.astype(np.float32)
result = {'img': img}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.detector(input)
assert result is not None
bboxes = result[0][:, :4].tolist()
scores = result[0][:, 4].tolist()
lms = result[1].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.BOXES: bboxes,
OutputKeys.KEYPOINTS: lms,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
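
A minimal usage sketch; since the face_detection default in builder.py still points at the SCRFD model, a RetinaFace-based hub model has to be named explicitly. The model id and image path below are placeholders, not values from this diff.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<retinaface-model-id>' is a placeholder for a hub model whose configuration
# selects Pipelines.retina_face_detection.
face_detect = pipeline(Tasks.face_detection, model='<retinaface-model-id>')

result = face_detect('face.jpg')  # placeholder image path
print(result[OutputKeys.SCORES], result[OutputKeys.BOXES], result[OutputKeys.KEYPOINTS])
```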

+ 51
- 0
modelscope/pipelines/cv/shop_segmentation_pipleline.py View File

@@ -0,0 +1,51 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation)
class ShopSegmentationPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Input) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input)
img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
result = {
'img': img_tensor,
'ori_h': ori_h,
'ori_w': ori_w,
'crop_h': crop_h,
'crop_w': crop_w
}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:

outputs = self.model.inference(input['img'])
result = {
'data': outputs,
'ori_h': input['ori_h'],
'ori_w': input['ori_w'],
'crop_h': input['crop_h'],
'crop_w': input['crop_w'],
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:

data = self.model.postprocess(inputs['data'], inputs['crop_h'],
inputs['crop_w'], inputs['ori_h'],
inputs['ori_w'])
outputs = {OutputKeys.MASKS: data}
return outputs
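
A minimal usage sketch with the default model id from builder.py; 'product.jpg' is a placeholder image path.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

shop_seg = pipeline(
    Tasks.shop_segmentation, model='damo/cv_vitb16_segmentation_shop-seg')

# The mask is a 2D array containing only 0 and 255 (see outputs.py).
result = shop_seg('product.jpg')  # placeholder image path
mask = result[OutputKeys.MASKS]
```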

+ 51
- 0
modelscope/pipelines/cv/text_driven_segmentation_pipleline.py View File

@@ -0,0 +1,51 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.text_driven_segmentation,
module_name=Pipelines.text_driven_segmentation)
class TextDrivenSegmentationPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Dict) -> Dict[str, Any]:
img = LoadImage.convert_to_ndarray(input['image'])
img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img)
result = {
'img': img_tensor,
'ori_h': ori_h,
'ori_w': ori_w,
'crop_h': crop_h,
'crop_w': crop_w,
'text': input['text'],
}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
outputs = self.model.inference(input['img'], input['text'])
result = {
'data': outputs,
'ori_h': input['ori_h'],
'ori_w': input['ori_w'],
'crop_h': input['crop_h'],
'crop_w': input['crop_w'],
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
data = self.model.postprocess(inputs['data'], inputs['crop_h'],
inputs['crop_w'], inputs['ori_h'],
inputs['ori_w'])
outputs = {OutputKeys.MASKS: data}
return outputs
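
A minimal usage sketch with the default model id from builder.py; note that preprocess() expects a dict with 'image' and 'text' keys. The image path and prompt are placeholders.

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

text_seg = pipeline(
    Tasks.text_driven_segmentation,
    model='damo/cv_vitl16_segmentation_text-driven-seg')

# Input is a dict: the image to segment and the text describing the target.
result = text_seg({'image': 'shirt.jpg', 'text': 'the red shirt'})
mask = result[OutputKeys.MASKS]  # 2D array of 0/255
```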

Some files were not shown because too many files changed in this diff
