From 291f8fe68c3462abc6462c5e408e7f349203f630 Mon Sep 17 00:00:00 2001 From: "lllcho.lc" Date: Thu, 1 Sep 2022 18:14:37 +0800 Subject: [PATCH 01/28] [to #42322933] Add action-detection model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加新的action-detection task Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9898947 --- .../videos/action_detection_test_video.mp4 | 3 + modelscope/metainfo.py | 1 + .../models/cv/action_detection/__init__.py | 21 +++ .../action_detection/action_detection_onnx.py | 177 ++++++++++++++++++ modelscope/outputs.py | 15 ++ modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 2 + .../pipelines/cv/action_detection_pipeline.py | 63 +++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_action_detection.py | 22 +++ 10 files changed, 307 insertions(+) create mode 100644 data/test/videos/action_detection_test_video.mp4 create mode 100644 modelscope/models/cv/action_detection/__init__.py create mode 100644 modelscope/models/cv/action_detection/action_detection_onnx.py create mode 100644 modelscope/pipelines/cv/action_detection_pipeline.py create mode 100644 tests/pipelines/test_action_detection.py diff --git a/data/test/videos/action_detection_test_video.mp4 b/data/test/videos/action_detection_test_video.mp4 new file mode 100644 index 00000000..e2ea1d80 --- /dev/null +++ b/data/test/videos/action_detection_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b7c3bc7c82ea5fee9d83130041df01046d89143ff77058b04577455ff6fdc92 +size 3191059 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 6f34b1a3..7c5afe80 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -133,6 +133,7 @@ class Pipelines(object): skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' crowd_counting = 'hrnet-crowd-counting' + action_detection = 'ResNetC3D-action-detection' video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' image_panoptic_segmentation = 'image-panoptic-segmentation' video_summarization = 'googlenet_pgl_video_summarization' diff --git a/modelscope/models/cv/action_detection/__init__.py b/modelscope/models/cv/action_detection/__init__.py new file mode 100644 index 00000000..fedbe19c --- /dev/null +++ b/modelscope/models/cv/action_detection/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .action_detection_onnx import ActionDetONNX + +else: + _import_structure = {'action_detection_onnx': ['ActionDetONNX']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py new file mode 100644 index 00000000..3c171473 --- /dev/null +++ b/modelscope/models/cv/action_detection/action_detection_onnx.py @@ -0,0 +1,177 @@ +import os +import os.path as osp +import shutil +import subprocess + +import cv2 +import numpy as np +import onnxruntime as rt + +from modelscope.models import Model +from modelscope.utils.constant import Devices +from modelscope.utils.device import verify_device + + +class ActionDetONNX(Model): + + def __init__(self, model_dir, config, *args, **kwargs): + super().__init__(self, model_dir, *args, **kwargs) + model_file = osp.join(config['model_file']) + device_type, device_id = verify_device(self._device_name) + options = rt.SessionOptions() + options.intra_op_num_threads = 1 + options.inter_op_num_threads = 1 + if device_type == Devices.gpu: + sess = rt.InferenceSession( + model_file, + providers=['CUDAExecutionProvider'], + sess_options=options, + provider_options=[{ + 'device_id': device_id + }]) + else: + sess = rt.InferenceSession( + model_file, + providers=['CPUExecutionProvider'], + sess_options=options) + self.input_name = sess.get_inputs()[0].name + self.sess = sess + self.num_stride = len(config['fpn_strides']) + self.score_thresh = np.asarray( + config['pre_nms_thresh'], dtype='float32').reshape((1, -1)) + self.size_divisibility = config['size_divisibility'] + self.nms_threshold = config['nms_thresh'] + self.tmp_dir = config['tmp_dir'] + self.temporal_stride = config['step'] + self.input_data_type = config['input_type'] + self.action_names = config['action_names'] + self.video_length_limit = config['video_length_limit'] + + def resize_box(self, det, height, width, scale_h, scale_w): + bboxs = det[0] + bboxs[:, [0, 2]] *= scale_w + bboxs[:, [1, 3]] *= scale_h + bboxs[:, [0, 2]] = bboxs[:, [0, 2]].clip(0, width - 1) + bboxs[:, [1, 3]] = bboxs[:, [1, 3]].clip(0, height - 1) + result = { + 'boxes': bboxs.round().astype('int32').tolist(), + 'scores': det[1].tolist(), + 'labels': [self.action_names[i] for i in det[2].tolist()] + } + return result + + def parse_frames(self, frame_names): + imgs = [cv2.imread(name)[:, :, ::-1] for name in frame_names] + imgs = np.stack(imgs).astype(self.input_data_type).transpose( + (3, 0, 1, 2)) # c,t,h,w + imgs = imgs[None] + return imgs + + def forward_img(self, imgs, h, w): + pred = self.sess.run(None, { + self.input_name: imgs, + 'height': np.asarray(h), + 'width': np.asarray(w) + }) + dets = self.post_nms( + pred, + score_threshold=self.score_thresh, + nms_threshold=self.nms_threshold) + return dets + + def forward_video(self, video_name, scale): + min_size, max_size = self._get_sizes(scale) + + tmp_dir = osp.join(self.tmp_dir, osp.basename(video_name)[:-4]) + if osp.exists(tmp_dir): + shutil.rmtree(tmp_dir) + os.makedirs(tmp_dir) + frame_rate = 2 + cmd = f'ffmpeg -y -loglevel quiet -ss 0 -t {self.video_length_limit}' + \ + f' -i {video_name} -r {frame_rate} -f image2 {tmp_dir}/%06d.jpg' + + cmd = cmd.split(' ') + subprocess.call(cmd) + + frame_names = [ + 
osp.join(tmp_dir, name) for name in sorted(os.listdir(tmp_dir)) + if name.endswith('.jpg') + ] + frame_names = [ + frame_names[i:i + frame_rate * 2] + for i in range(0, + len(frame_names) - frame_rate * 2 + 1, frame_rate + * self.temporal_stride) + ] + timestamp = list( + range(1, + len(frame_names) * self.temporal_stride, + self.temporal_stride)) + batch_imgs = [self.parse_frames(names) for names in frame_names] + + N, _, T, H, W = batch_imgs[0].shape + scale_min = min_size / min(H, W) + h, w = min(int(scale_min * H), + max_size), min(int(scale_min * W), max_size) + h = round(h / self.size_divisibility) * self.size_divisibility + w = round(w / self.size_divisibility) * self.size_divisibility + scale_h, scale_w = H / h, W / w + + results = [] + for imgs in batch_imgs: + det = self.forward_img(imgs, h, w) + det = self.resize_box(det[0], H, W, scale_h, scale_w) + results.append(det) + results = [{ + 'timestamp': t, + 'actions': res + } for t, res in zip(timestamp, results)] + shutil.rmtree(tmp_dir) + return results + + def forward(self, video_name): + return self.forward_video(video_name, scale=1) + + def post_nms(self, pred, score_threshold, nms_threshold=0.3): + pred_bboxes, pred_scores = pred + N = len(pred_bboxes) + dets = [] + for i in range(N): + bboxes, scores = pred_bboxes[i], pred_scores[i] + candidate_inds = scores > score_threshold + scores = scores[candidate_inds] + candidate_nonzeros = candidate_inds.nonzero() + bboxes = bboxes[candidate_nonzeros[0]] + labels = candidate_nonzeros[1] + keep = self._nms(bboxes, scores, labels, nms_threshold) + bbox = bboxes[keep] + score = scores[keep] + label = labels[keep] + dets.append((bbox, score, label)) + return dets + + def _nms(self, boxes, scores, idxs, nms_threshold): + if len(boxes) == 0: + return [] + max_coordinate = boxes.max() + offsets = idxs * (max_coordinate + 1) + boxes_for_nms = boxes + offsets[:, None].astype('float32') + boxes_for_nms[:, 2] = boxes_for_nms[:, 2] - boxes_for_nms[:, 0] + boxes_for_nms[:, 3] = boxes_for_nms[:, 3] - boxes_for_nms[:, 1] + keep = cv2.dnn.NMSBoxes( + boxes_for_nms.tolist(), + scores.tolist(), + score_threshold=0, + nms_threshold=nms_threshold) + if len(keep.shape) == 2: + keep = np.squeeze(keep, 1) + return keep + + def _get_sizes(self, scale): + if scale == 1: + min_size, max_size = 512, 896 + elif scale == 2: + min_size, max_size = 768, 1280 + else: + min_size, max_size = 1024, 1792 + return min_size, max_size diff --git a/modelscope/outputs.py b/modelscope/outputs.py index aebb9138..7d6cdb59 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -35,6 +35,7 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + TIMESTAMPS = 'timestamps' SPLIT_VIDEO_NUM = 'split_video_num' SPLIT_META_DICT = 'split_meta_dict' @@ -541,6 +542,19 @@ TASK_OUTPUTS = { # } Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS], + # { + # 'labels': ['吸烟', '打电话', '吸烟'], + # 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487], + # 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]], + # 'timestamps': [1, 3, 5] + # } + Tasks.action_detection: [ + OutputKeys.TIMESTAMPS, + OutputKeys.LABELS, + OutputKeys.SCORES, + OutputKeys.BOXES, + ], + # { # 'output': [ # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509}, @@ -551,6 +565,7 @@ TASK_OUTPUTS = { # {'label': '13421097', 'score': 2.75914817393641e-06}]] # } Tasks.faq_question_answering: [OutputKeys.OUTPUT], + # image person reid result for 
single sample # { # "img_embedding": np.array with shape [1, D], diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 8a1a3646..c9f0c252 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -71,6 +71,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), Tasks.action_recognition: (Pipelines.action_recognition, 'damo/cv_TAdaConv_action-recognition'), + Tasks.action_detection: (Pipelines.action_detection, + 'damo/cv_ResNetC3D_action-detection_detection2d'), Tasks.live_category: (Pipelines.live_category, 'damo/cv_resnet50_live-category'), Tasks.video_category: (Pipelines.video_category, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 01c69758..f4e6792b 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -5,6 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .action_recognition_pipeline import ActionRecognitionPipeline + from .action_detection_pipeline import ActionDetectionPipeline from .animal_recognition_pipeline import AnimalRecognitionPipeline from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline @@ -48,6 +49,7 @@ if TYPE_CHECKING: else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], + 'action_detection_pipeline': ['ActionDetectionPipeline'], 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], diff --git a/modelscope/pipelines/cv/action_detection_pipeline.py b/modelscope/pipelines/cv/action_detection_pipeline.py new file mode 100644 index 00000000..72335d5b --- /dev/null +++ b/modelscope/pipelines/cv/action_detection_pipeline.py @@ -0,0 +1,63 @@ +import math +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.action_detection import ActionDetONNX +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.action_detection, module_name=Pipelines.action_detection) +class ActionDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a action detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + model_path = osp.join(self.model, ModelFile.ONNX_MODEL_FILE) + logger.info(f'loading model from {model_path}') + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + self.cfg.MODEL.model_file = model_path + self.model = ActionDetONNX(self.model, self.cfg.MODEL, + self.device_name) + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + video_name = input + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + result = {'video_name': video_name} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + preds = self.model.forward(input['video_name']) + labels = sum([pred['actions']['labels'] for pred in preds], []) + scores = sum([pred['actions']['scores'] for pred in preds], []) + boxes = sum([pred['actions']['boxes'] for pred in preds], []) + timestamps = sum([[pred['timestamp']] * len(pred['actions']['labels']) + for pred in preds], []) + out = { + OutputKeys.TIMESTAMPS: timestamps, + OutputKeys.LABELS: labels, + OutputKeys.SCORES: scores, + OutputKeys.BOXES: boxes + } + return out + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 960e9600..2265ef5a 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -58,6 +58,7 @@ class CVTasks(object): # video recognition live_category = 'live-category' action_recognition = 'action-recognition' + action_detection = 'action-detection' video_category = 'video-category' video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' diff --git a/tests/pipelines/test_action_detection.py b/tests/pipelines/test_action_detection.py new file mode 100644 index 00000000..c752dc78 --- /dev/null +++ b/tests/pipelines/test_action_detection.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.test_utils import test_level + + +class ActionDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + action_detection_pipline = pipeline( + Tasks.action_detection, + model='damo/cv_ResNetC3D_action-detection_detection2d') + result = action_detection_pipline( + 'data/test/videos/action_detection_test_video.mp4') + print('action detection results:', result) + + +if __name__ == '__main__': + unittest.main() From f5fb8cf5318f3dfb0015484557dd0e03b9c42a8b Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 1 Sep 2022 18:56:51 +0800 Subject: [PATCH 02/28] [to #42322933] fix bug about loading new trained model and update doc string Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9987197 --- modelscope/models/audio/ans/__init__.py | 4 +- modelscope/models/audio/ans/complex_nn.py | 6 ++ modelscope/models/audio/ans/conv_stft.py | 1 + modelscope/models/audio/ans/frcrn.py | 62 +++---------------- .../models/audio/ans/se_module_complex.py | 1 + modelscope/models/audio/ans/unet.py | 4 ++ modelscope/trainers/audio/ans_trainer.py | 7 +-- modelscope/utils/audio/audio_utils.py | 18 +++--- 8 files changed, 32 insertions(+), 71 deletions(-) diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py index b602ad01..afcdf314 100644 --- a/modelscope/models/audio/ans/__init__.py +++ b/modelscope/models/audio/ans/__init__.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .frcrn import FRCRNModel + from .frcrn import FRCRNDecorator else: _import_structure = { - 'frcrn': ['FRCRNModel'], + 'frcrn': ['FRCRNDecorator'], } import sys diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py index 69dec41e..c61446c2 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -1,3 +1,9 @@ +""" +class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of +Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). +from https://github.com/sweetcocoa/DeepComplexUNetPyTorch + +""" import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/audio/ans/conv_stft.py b/modelscope/models/audio/ans/conv_stft.py index a47d7817..4b393a4c 100644 --- a/modelscope/models/audio/ans/conv_stft.py +++ b/modelscope/models/audio/ans/conv_stft.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py index 59411fbe..b74fc273 100644 --- a/modelscope/models/audio/ans/frcrn.py +++ b/modelscope/models/audio/ans/frcrn.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os from typing import Dict @@ -14,54 +15,10 @@ from .conv_stft import ConviSTFT, ConvSTFT from .unet import UNet -class FTB(nn.Module): - - def __init__(self, input_dim=257, in_channel=9, r_channel=5): - - super(FTB, self).__init__() - self.in_channel = in_channel - self.conv1 = nn.Sequential( - nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]), - nn.BatchNorm2d(r_channel), nn.ReLU()) - - self.conv1d = nn.Sequential( - nn.Conv1d( - r_channel * input_dim, in_channel, kernel_size=9, padding=4), - nn.BatchNorm1d(in_channel), nn.ReLU()) - self.freq_fc = nn.Linear(input_dim, input_dim, bias=False) - - self.conv2 = nn.Sequential( - nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]), - nn.BatchNorm2d(in_channel), nn.ReLU()) - - def forward(self, inputs): - ''' - inputs should be [Batch, Ca, Dim, Time] - ''' - # T-F attention - conv1_out = self.conv1(inputs) - B, C, D, T = conv1_out.size() - reshape1_out = torch.reshape(conv1_out, [B, C * D, T]) - conv1d_out = self.conv1d(reshape1_out) - conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T]) - - # now is also [B,C,D,T] - att_out = conv1d_out * inputs - - # tranpose to [B,C,T,D] - att_out = torch.transpose(att_out, 2, 3) - freqfc_out = self.freq_fc(att_out) - att_out = torch.transpose(freqfc_out, 2, 3) - - cat_out = torch.cat([att_out, inputs], 1) - outputs = self.conv2(cat_out) - return outputs - - @MODELS.register_module( Tasks.acoustic_noise_suppression, module_name=Models.speech_frcrn_ans_cirm_16k) -class FRCRNModel(TorchModel): +class FRCRNDecorator(TorchModel): r""" A decorator of FRCRN for integrating into modelscope framework """ def __init__(self, model_dir: str, *args, **kwargs): @@ -78,13 +35,14 @@ class FRCRNModel(TorchModel): checkpoint = torch.load( model_bin_file, map_location=torch.device('cpu')) if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: - self.model.load_state_dict( - checkpoint['state_dict'], strict=False) + # the new trained model by user is based on FRCRNDecorator + self.load_state_dict(checkpoint['state_dict']) else: + # The released model on Modelscope is based on FRCRN self.model.load_state_dict(checkpoint, strict=False) - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - result_list = self.model.forward(input['noisy']) + def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + result_list = self.model.forward(inputs['noisy']) output = { 'spec_l1': result_list[0], 'wav_l1': result_list[1], @@ -93,12 +51,12 @@ class FRCRNModel(TorchModel): 'wav_l2': result_list[4], 'mask_l2': result_list[5] } - if 'clean' in input: + if 'clean' in inputs: mix_result = self.model.loss( - input['noisy'], input['clean'], result_list, mode='Mix') + inputs['noisy'], inputs['clean'], result_list, mode='Mix') output.update(mix_result) sisnr_result = self.model.loss( - input['noisy'], input['clean'], result_list, mode='SiSNR') + inputs['noisy'], inputs['clean'], result_list, mode='SiSNR') output.update(sisnr_result) # logger hooker will use items under 'log_vars' output['log_vars'] = {k: mix_result[k].item() for k in mix_result} diff --git a/modelscope/models/audio/ans/se_module_complex.py b/modelscope/models/audio/ans/se_module_complex.py index f62fe523..b58eb6ba 100644 --- a/modelscope/models/audio/ans/se_module_complex.py +++ b/modelscope/models/audio/ans/se_module_complex.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import torch from torch import nn diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py index aa5a4254..ae66eb69 100644 --- a/modelscope/models/audio/ans/unet.py +++ b/modelscope/models/audio/ans/unet.py @@ -1,3 +1,7 @@ +""" +Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). +from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +""" import torch import torch.nn as nn diff --git a/modelscope/trainers/audio/ans_trainer.py b/modelscope/trainers/audio/ans_trainer.py index f782b836..37b201ce 100644 --- a/modelscope/trainers/audio/ans_trainer.py +++ b/modelscope/trainers/audio/ans_trainer.py @@ -1,10 +1,5 @@ -import time -from typing import List, Optional, Union - -from datasets import Dataset - +# Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.metainfo import Trainers -from modelscope.preprocessors import Preprocessor from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS from modelscope.utils.constant import TrainerStages diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 14374c65..61964345 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -1,5 +1,4 @@ -import numpy as np - +# Copyright (c) Alibaba, Inc. and its affiliates. SEGMENT_LENGTH_TRAIN = 16000 @@ -9,16 +8,13 @@ def to_segment(batch, segment_length=SEGMENT_LENGTH_TRAIN): It only works in batch mode. """ noisy_arrays = [] - for x in batch['noisy']: - length = len(x['array']) - noisy = np.array(x['array']) - for offset in range(segment_length, length, segment_length): - noisy_arrays.append(noisy[offset - segment_length:offset]) clean_arrays = [] - for x in batch['clean']: - length = len(x['array']) - clean = np.array(x['array']) - for offset in range(segment_length, length, segment_length): + for x, y in zip(batch['noisy'], batch['clean']): + length = min(len(x['array']), len(y['array'])) + noisy = x['array'] + clean = y['array'] + for offset in range(segment_length, length + 1, segment_length): + noisy_arrays.append(noisy[offset - segment_length:offset]) clean_arrays.append(clean[offset - segment_length:offset]) return {'noisy': noisy_arrays, 'clean': clean_arrays} From af4c6f70c296cbffdc6a5962791eed179ed611c7 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 1 Sep 2022 20:06:42 +0800 Subject: [PATCH 03/28] [to #42322933]allow none decorator registry in ast --- modelscope/utils/ast_utils.py | 65 ++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 990a9571..263a81b3 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -36,6 +36,7 @@ SCAN_SUB_FOLDERS = [ ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' +EXPRESS_KEY = 'express' FROM_IMPORT_KEY = 'from_imports' IMPORT_KEY = 'imports' FILE_NAME_KEY = 'filepath' @@ -45,6 +46,9 @@ INDEX_KEY = 'index' REQUIREMENT_KEY = 'requirements' MODULE_KEY = 'module' CLASS_NAME = 'class_name' +GROUP_KEY = 'group_key' +MODULE_NAME = 'module_name' +MODULE_CLS = 'module_cls' class AstScaning(object): @@ -53,6 +57,7 @@ class AstScaning(object): self.result_import = dict() self.result_from_import = dict() self.result_decorator = [] + self.express = [] def _is_sub_node(self, node: object) -> bool: return isinstance(node, @@ -108,6 +113,7 @@ class AstScaning(object): self.result_import = dict() self.result_from_import = 
dict() self.result_decorator = [] + self.result_express = [] def scan_ast(self, node: Union[ast.AST, None, str]): self._setup_global() @@ -243,13 +249,19 @@ class AstScaning(object): setattr(item, CLASS_NAME, node.name) self.result_decorator.extend(attr) + if attr != [] and type( + attr + ).__name__ == 'Call' and parent_node_name == 'Expr': + self.result_express.append(attr) + out += f'{indentstr()}{field}={representation},\n' out += indentstr() + ')' return { IMPORT_KEY: self.result_import, FROM_IMPORT_KEY: self.result_from_import, - DECORATOR_KEY: self.result_decorator + DECORATOR_KEY: self.result_decorator, + EXPRESS_KEY: self.result_express }, out def _parse_decorator(self, node: ast.AST) -> tuple: @@ -267,7 +279,10 @@ class AstScaning(object): def _get_args_name(nodes: list) -> list: result = [] for node in nodes: - result.append(_get_attribute_item(node)) + if type(node).__name__ == 'Str': + result.append((node.s, None)) + else: + result.append(_get_attribute_item(node)) return result def _get_keyword_name(nodes: ast.AST) -> list: @@ -276,9 +291,11 @@ class AstScaning(object): if type(node).__name__ == 'keyword': attribute_node = getattr(node, 'value') if type(attribute_node).__name__ == 'Str': - result.append((attribute_node.s, None)) + result.append((getattr(node, + 'arg'), attribute_node.s, None)) else: - result.append(_get_attribute_item(attribute_node)) + result.append((getattr(node, 'arg'), ) + + _get_attribute_item(attribute_node)) return result functions = _get_attribute_item(node.func) @@ -315,10 +332,26 @@ class AstScaning(object): args_list.append(default_group) if len(keyword_list) == 0 and len(args_list) == 1: args_list.append(class_name) - if len(keyword_list) == 1 and len(args_list) == 0: + + if len(keyword_list) > 0 and len(args_list) == 0: + remove_group_item = None + for item in keyword_list: + key, name, attr = item + if key == GROUP_KEY: + args_list.append((name, attr)) + remove_group_item = item + if remove_group_item is not None: + keyword_list.remove(remove_group_item) + + if len(args_list) == 0: args_list.append(default_group) - args_list.extend(keyword_list) + for item in keyword_list: + key, name, attr = item + if key == MODULE_CLS: + class_name = name + else: + args_list.append((name, attr)) for item in args_list: # the case empty input @@ -347,9 +380,14 @@ class AstScaning(object): for node in nodes: if type(node).__name__ != 'Call': continue + class_name = getattr(node, CLASS_NAME, None) + func = getattr(node, 'func') + + if getattr(func, 'attr', None) != REGISTER_MODULE: + continue + parse_output = self._parse_decorator(node) - index = self._registry_indexer(parse_output, - getattr(node, CLASS_NAME)) + index = self._registry_indexer(parse_output, class_name) if None is not index: results.append(index) return results @@ -363,6 +401,8 @@ class AstScaning(object): node = gast.parse(data) output, _ = self.scan_import(node, indent=' ', show_offsets=False) output[DECORATOR_KEY] = self.parse_decorators(output[DECORATOR_KEY]) + output[EXPRESS_KEY] = self.parse_decorators(output[EXPRESS_KEY]) + output[DECORATOR_KEY].extend(output[EXPRESS_KEY]) return output @@ -481,6 +521,13 @@ class FilesAstScaning(object): module_import[value_dict[MODULE_KEY]] = value_dict[IMPORT_KEY] return module_import + def _ignore_useless_keys(self, inverted_index): + if ('OPTIMIZERS', 'default', 'name') in inverted_index: + del inverted_index[('OPTIMIZERS', 'default', 'name')] + if ('LR_SCHEDULER', 'default', 'name') in inverted_index: + del inverted_index[('LR_SCHEDULER', 'default', 
'name')] + return inverted_index + def get_files_scan_results(self, target_dir=MODELSCOPE_PATH, target_folders=SCAN_SUB_FOLDERS): @@ -514,6 +561,8 @@ class FilesAstScaning(object): MODULE_KEY: module_name } inverted_index_with_results = self._inverted_index(result) + inverted_index_with_results = self._ignore_useless_keys( + inverted_index_with_results) module_import = self._module_import(result) index = { INDEX_KEY: inverted_index_with_results, From 780330897a47bf24437090e48cf4350dae7af8ed Mon Sep 17 00:00:00 2001 From: "peter.lx" Date: Thu, 1 Sep 2022 22:17:14 +0800 Subject: [PATCH 04/28] [to #42322933] add Deberta v2 modeling and fill_mask task, with master merged add Deberta v2 modeling and fill_mask task, with master merged Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9966511 --- modelscope/metainfo.py | 1 + modelscope/models/nlp/__init__.py | 16 +- modelscope/models/nlp/deberta_v2/__init__.py | 73 + .../deberta_v2/configuration_deberta_v2.py | 130 ++ .../nlp/deberta_v2/modeling_deberta_v2.py | 1789 +++++++++++++++++ .../nlp/deberta_v2/tokenization_deberta_v2.py | 546 +++++ .../tokenization_deberta_v2_fast.py | 241 +++ modelscope/models/nlp/masked_language.py | 39 + .../pipelines/nlp/fill_mask_pipeline.py | 16 +- modelscope/preprocessors/nlp.py | 3 + tests/pipelines/test_deberta_tasks.py | 62 + 11 files changed, 2907 insertions(+), 9 deletions(-) create mode 100644 modelscope/models/nlp/deberta_v2/__init__.py create mode 100644 modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py create mode 100644 modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py create mode 100644 tests/pipelines/test_deberta_tasks.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 7c5afe80..971dd3f1 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -37,6 +37,7 @@ class Models(object): bert = 'bert' palm = 'palm-v2' structbert = 'structbert' + deberta_v2 = 'deberta_v2' veco = 'veco' translation = 'csanmt-translation' space_dst = 'space-dst' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index e17a1d31..fd61e40b 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -9,12 +9,15 @@ if TYPE_CHECKING: from .bert_for_sequence_classification import BertForSequenceClassification from .bert_for_document_segmentation import BertForDocumentSegmentation from .csanmt_for_translation import CsanmtForTranslation - from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, - BertForMaskedLM) + from .masked_language import ( + StructBertForMaskedLM, + VecoForMaskedLM, + BertForMaskedLM, + DebertaV2ForMaskedLM, + ) from .nncrf_for_named_entity_recognition import ( TransformerCRFForNamedEntityRecognition, LSTMCRFForNamedEntityRecognition) - from .palm_v2 import PalmForTextGeneration from .token_classification import SbertForTokenClassification from .sequence_classification import VecoForSequenceClassification, SbertForSequenceClassification from .space import SpaceForDialogIntent @@ -22,7 +25,6 @@ if TYPE_CHECKING: from .space import SpaceForDialogStateTracking from .star_text_to_sql import StarForTextToSql from .task_models import (InformationExtractionModel, - SequenceClassificationModel, SingleBackboneTaskModelBase) from .bart_for_text_error_correction import BartForTextErrorCorrection from .gpt3 import 
GPT3ForTextGeneration @@ -36,8 +38,10 @@ else: 'csanmt_for_translation': ['CsanmtForTranslation'], 'bert_for_sequence_classification': ['BertForSequenceClassification'], 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], - 'masked_language': - ['StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM'], + 'masked_language': [ + 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', + 'DebertaV2ForMaskedLM' + ], 'nncrf_for_named_entity_recognition': [ 'TransformerCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition' diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py new file mode 100644 index 00000000..664fc6c6 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -0,0 +1,73 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +_import_structure = { + 'configuration_deberta_v2': [ + 'DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config', + 'DebertaV2OnnxConfig' + ], + 'tokenization_deberta_v2': ['DebertaV2Tokenizer'], +} + +if TYPE_CHECKING: + from .configuration_deberta_v2 import DebertaV2Config + from .tokenization_deberta_v2 import DebertaV2Tokenizer + from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast + + from .modeling_deberta_v2 import ( + DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST, + DebertaV2ForMaskedLM, + DebertaV2ForMultipleChoice, + DebertaV2ForQuestionAnswering, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2Model, + DebertaV2PreTrainedModel, + ) + +else: + _import_structure = { + 'configuration_deberta_v2': + ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'], + 'tokenization_deberta_v2': ['DebertaV2Tokenizer'] + } + _import_structure['tokenization_deberta_v2_fast'] = [ + 'DebertaV2TokenizerFast' + ] + _import_structure['modeling_deberta_v2'] = [ + 'DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST', + 'DebertaV2ForMaskedLM', + 'DebertaV2ForMultipleChoice', + 'DebertaV2ForQuestionAnswering', + 'DebertaV2ForSequenceClassification', + 'DebertaV2ForTokenClassification', + 'DebertaV2Model', + 'DebertaV2PreTrainedModel', + ] + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__) diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py new file mode 100644 index 00000000..65e8f0b7 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py @@ -0,0 +1,130 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2020, Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + +from transformers import PretrainedConfig + +from modelscope.utils import logger as logging + +logger = logging.get_logger(__name__) + + +class DebertaV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used to instantiate a + DeBERTa-v2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DeBERTa + [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 128100): + Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DebertaV2Model`]. + hidden_size (`int`, *optional*, defaults to 1536): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 24): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` + are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 0): + The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-7): + The epsilon used by the layer normalization layers. + relative_attention (`bool`, *optional*, defaults to `True`): + Whether use relative position encoding. + max_relative_positions (`int`, *optional*, defaults to -1): + The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value + as `max_position_embeddings`. + pad_token_id (`int`, *optional*, defaults to 0): + The value used to pad input_ids. + position_biased_input (`bool`, *optional*, defaults to `False`): + Whether add absolute position embedding to content embedding. + pos_att_type (`List[str]`, *optional*): + The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`, + `["p2c", "c2p"]`, `["p2c", "c2p"]`. + layer_norm_eps (`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + """ + model_type = 'deberta_v2' + + def __init__(self, + vocab_size=128100, + hidden_size=1536, + num_hidden_layers=24, + num_attention_heads=24, + intermediate_size=6144, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=0, + initializer_range=0.02, + layer_norm_eps=1e-7, + relative_attention=False, + max_relative_positions=-1, + pad_token_id=0, + position_biased_input=True, + pos_att_type=None, + pooler_dropout=0, + pooler_hidden_act='gelu', + **kwargs): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.relative_attention = relative_attention + self.max_relative_positions = max_relative_positions + self.pad_token_id = pad_token_id + self.position_biased_input = position_biased_input + + # Backwards compatibility + if type(pos_att_type) == str: + pos_att_type = [x.strip() for x in pos_att_type.lower().split('|')] + + self.pos_att_type = pos_att_type + self.vocab_size = vocab_size + self.layer_norm_eps = layer_norm_eps + + self.pooler_hidden_size = kwargs.get('pooler_hidden_size', hidden_size) + self.pooler_dropout = pooler_dropout + self.pooler_hidden_act = pooler_hidden_act diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py new file mode 100644 index 00000000..1c6b9071 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py @@ -0,0 +1,1789 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch DeBERTa-v2 model.""" + +from collections.abc import Sequence +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward) +from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import softmax_backward_data + +from modelscope.utils import logger as logging +from .configuration_deberta_v2 import DebertaV2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'DebertaV2Config' +_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer' +_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite' + + +# Copied from transformers.models.deberta.modeling_deberta.ContextPooler +class ContextPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.pooler_hidden_size, + config.pooler_hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 +class XSoftmax(torch.autograd.Function): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (`torch.tensor`): The input tensor that will apply softmax. + mask (`torch.IntTensor`): + The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
+ dim (int): The dimension that will apply softmax + + Example: + + ```python + >>> import torch + >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax + + >>> # Make a tensor + >>> x = torch.randn([4, 20, 100]) + + >>> # Create a mask + >>> mask = (x > 0).int() + + >>> # Specify the dimension to apply softmax + >>> dim = -1 + + >>> y = XSoftmax.apply(x, mask, dim) + ```""" + + @staticmethod + def forward(self, input, mask, dim): + self.dim = dim + rmask = ~(mask.to(torch.bool)) + + output = input.masked_fill(rmask, + torch.tensor(torch.finfo(input.dtype).min)) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + (output, ) = self.saved_tensors + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, + output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op( + 'Cast', mask, to_i=sym_help.cast_pytorch_to_onnx['Long']) + r_mask = g.op( + 'Cast', + g.op('Sub', + g.op('Constant', value_t=torch.tensor(1, dtype=torch.int64)), + mask_cast_value), + to_i=sym_help.cast_pytorch_to_onnx['Byte'], + ) + output = masked_fill( + g, self, r_mask, + g.op( + 'Constant', + value_t=torch.tensor(torch.finfo(self.type().dtype()).min))) + output = softmax(g, output, dim) + return masked_fill( + g, output, r_mask, + g.op('Constant', value_t=torch.tensor(0, dtype=torch.uint8))) + + +# Copied from transformers.models.deberta.modeling_deberta.DropoutContext +class DropoutContext(object): + + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +# Copied from transformers.models.deberta.modeling_deberta.get_mask +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to( + torch.bool) + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +# Copied from transformers.models.deberta.modeling_deberta.XDropout +class XDropout(torch.autograd.Function): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask, ) = ctx.saved_tensors + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + @staticmethod + def symbolic(g: torch._C.Graph, input: torch._C.Value, + local_ctx: Union[float, DropoutContext]) -> torch._C.Value: + from torch.onnx import symbolic_opset12 + + dropout_p = local_ctx + if isinstance(local_ctx, DropoutContext): + dropout_p = local_ctx.dropout + # StableDropout only calls this function when training. 
+ train = True + # TODO: We should check if the opset_version being used to export + # is > 12 here, but there's no good way to do that. As-is, if the + # opset_version < 12, export will fail with a CheckerError. + # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like: + # if opset_version < 12: + # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train) + return symbolic_opset12.dropout(g, input, dropout_p, train) + + +# Copied from transformers.models.deberta.modeling_deberta.StableDropout +class StableDropout(nn.Module): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ + Call the module + + Args: + x (`torch.tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm +class DebertaV2SelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 +class DebertaV2Attention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + self_output, att_matrix = self_output + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 +class DebertaV2Intermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + 
self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm +class DebertaV2Output(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 +class DebertaV2Layer(nn.Module): + + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class ConvLayer(nn.Module): + + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, 'conv_kernel_size', 3) + groups = getattr(config, 'conv_groups', 1) + self.conv_act = getattr(config, 'conv_act', 'tanh') + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size, + padding=(kernel_size - 1) // 2, + groups=groups) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute( + 0, 2, 1).contiguous() + rmask = (1 - input_mask).bool() + out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) + out = ACT2FN[self.conv_act](self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input).to(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.dim() != layer_norm_input.dim(): + if input_mask.dim() == 4: + input_mask = input_mask.squeeze(1).squeeze(1) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.to(output.dtype) + output_states = output * input_mask + + return output_states + + +class DebertaV2Encoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.ModuleList( + [DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.max_relative_positions = 
getattr(config, + 'max_relative_positions', -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, 'position_buckets', -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, + config.hidden_size) + + self.norm_rel_ebd = [ + x.strip() + for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|') + ] + + if 'layer_norm' in self.norm_rel_ebd: + self.LayerNorm = LayerNorm( + config.hidden_size, + config.layer_norm_eps, + elementwise_affine=True) + + self.conv = ConvLayer(config) if getattr(config, 'conv_kernel_size', + 0) > 0 else None + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze( + -2).unsqueeze(-1) + attention_mask = attention_mask.byte() + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.size( + -2) if query_states is not None else hidden_states.size(-2) + relative_pos = build_relative_position( + q, + hidden_states.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=True, + ): + if attention_mask.dim() <= 2: + input_mask = attention_mask + else: + input_mask = (attention_mask.sum(-2) > 0).byte() + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, + relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + output_states = next_kv + for i, layer_module in enumerate(self.layer): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states, ) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + output_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + ) + else: + output_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + output_states, att_m = output_states + + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, + input_mask) + + if query_states is not None: + query_states = output_states + 
if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len( + self.layer) else None + else: + next_kv = output_states + + if output_attentions: + all_attentions = all_attentions + (att_m, ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states, ) + + if not return_dict: + return tuple( + v for v in [output_states, all_hidden_states, all_attentions] + if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, + hidden_states=all_hidden_states, + attentions=all_attentions) + + +def make_log_bucket_position(relative_pos, bucket_size, max_position): + sign = torch.sign(relative_pos) + mid = bucket_size // 2 + abs_pos = torch.where( + (relative_pos < mid) & (relative_pos > -mid), + torch.tensor(mid - 1).type_as(relative_pos), + torch.abs(relative_pos), + ) + log_pos = ( + torch.ceil( + torch.log(abs_pos / mid) + / torch.log(torch.tensor( + (max_position - 1) / mid)) * (mid - 1)) + mid) + bucket_pos = torch.where(abs_pos <= mid, relative_pos.type_as(log_pos), + log_pos * sign) + return bucket_pos + + +def build_relative_position(query_size, + key_size, + bucket_size=-1, + max_position=-1): + """ + Build relative position according to the query and key + + We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key + \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - + P_k\\) + + Args: + query_size (int): the length of query + key_size (int): the length of key + bucket_size (int): the size of position bucket + max_position (int): the maximum allowed absolute position + + Return: + `torch.LongTensor`: A tensor with shape [1, query_size, key_size] + + """ + q_ids = torch.arange(0, query_size) + k_ids = torch.arange(0, key_size) + rel_pos_ids = q_ids[:, None] - k_ids[None, :] + if bucket_size > 0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, + max_position) + rel_pos_ids = rel_pos_ids.to(torch.long) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + return c2p_pos.expand([ + query_layer.size(0), + query_layer.size(1), + query_layer.size(2), + relative_pos.size(-1) + ]) + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + return c2p_pos.expand([ + query_layer.size(0), + query_layer.size(1), + key_layer.size(-2), + key_layer.size(-2) + ]) + + +@torch.jit.script +# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + return pos_index.expand(p2c_att.size()[:2] + + (pos_index.size(-2), key_layer.size(-2))) + + +class DisentangledSelfAttention(nn.Module): + """ + Disentangled self-attention module + + Parameters: + config (`DebertaV2Config`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaV2Config`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + self.num_attention_heads = config.num_attention_heads + _attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_head_size = getattr(config, 'attention_head_size', + _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + self.key_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + self.value_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, 'share_att_key', False) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + self.relative_attention = getattr(config, 'relative_attention', False) + + if self.relative_attention: + self.position_buckets = getattr(config, 'position_buckets', -1) + self.max_relative_positions = getattr(config, + 'max_relative_positions', -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets > 0: + self.pos_ebd_size = self.position_buckets + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if not self.share_att_key: + if 'c2p' in self.pos_att_type: + self.pos_key_proj = nn.Linear( + config.hidden_size, self.all_head_size, bias=True) + if 'p2c' in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, + self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), + x.size(-1)) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + """ + Call the module + + Args: + hidden_states (`torch.FloatTensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`torch.ByteTensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + output_attentions (`bool`, optional): + Whether return the attention matrix. + + query_states (`torch.FloatTensor`, optional): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`torch.LongTensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`torch.FloatTensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. 
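+
+        Note:
+            As a rough sketch (not the exact computation below): with both `c2p` and
+            `p2c` enabled, the unnormalized score between query i and key j is
+            approximately Q_i·K_j + Q_i·P_{d(i,j)} + K_j·P'_{d(j,i)}, all scaled by
+            1/sqrt(3 * head_dim), where P and P' are projections of `rel_embeddings`
+            and d(i,j) is the bucketed relative distance computed by
+            `build_relative_position`.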
+ + + """ + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores( + self.query_proj(query_states), self.num_attention_heads) + key_layer = self.transpose_for_scores( + self.key_proj(hidden_states), self.num_attention_heads) + value_layer = self.transpose_for_scores( + self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if 'c2p' in self.pos_att_type: + scale_factor += 1 + if 'p2c' in self.pos_att_type: + scale_factor += 1 + scale = torch.sqrt( + torch.tensor(query_layer.size(-1), dtype=torch.float) + * scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose( + -1, -2)) / torch.tensor( + scale, dtype=query_layer.dtype) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias(query_layer, key_layer, + relative_pos, + rel_embeddings, + scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + attention_scores = attention_scores + attention_scores = attention_scores.view(-1, self.num_attention_heads, + attention_scores.size(-2), + attention_scores.size(-1)) + + # bsz x height x length x dimension + attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + context_layer = torch.bmm( + attention_probs.view(-1, attention_probs.size(-2), + attention_probs.size(-1)), value_layer) + context_layer = ( + context_layer.view(-1, self.num_attention_heads, + context_layer.size(-2), + context_layer.size(-1)).permute(0, 2, 1, + 3).contiguous()) + new_context_layer_shape = context_layer.size()[:-2] + (-1, ) + context_layer = context_layer.view(new_context_layer_shape) + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, + rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position( + q, + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions) + if relative_pos.dim() == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim() == 3: + relative_pos = relative_pos.unsqueeze(1) + # bsz x height x query x key + elif relative_pos.dim() != 4: + raise ValueError( + f'Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}' + ) + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[0:att_span * 2, :].unsqueeze(0) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores( + self.query_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1) + pos_key_layer = self.transpose_for_scores( + self.key_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1) + else: + if 'c2p' in self.pos_att_type: + pos_key_layer = self.transpose_for_scores( + self.pos_key_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, + 1) # .split(self.all_head_size, dim=-1) + if 'p2c' in self.pos_att_type: + pos_query_layer = self.transpose_for_scores( + self.pos_query_proj(rel_embeddings), + self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, + 1) # .split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if 'c2p' in self.pos_att_type: + scale = torch.sqrt( + torch.tensor(pos_key_layer.size(-1), dtype=torch.float) + * scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch.gather( + c2p_att, + dim=-1, + index=c2p_pos.squeeze(0).expand([ + query_layer.size(0), + query_layer.size(1), + relative_pos.size(-1) + ]), + ) + score += c2p_att / torch.tensor(scale, dtype=c2p_att.dtype) + + # position->content + if 'p2c' in self.pos_att_type: + scale = torch.sqrt( + torch.tensor(pos_query_layer.size(-1), dtype=torch.float) + * scale_factor) + if key_layer.size(-2) != query_layer.size(-2): + r_pos = build_relative_position( + key_layer.size(-2), + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ).to(query_layer.device) + r_pos = r_pos.unsqueeze(0) + else: + r_pos = relative_pos + + p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) + p2c_att = torch.gather( + p2c_att, + dim=-1, + index=p2c_pos.squeeze(0).expand([ + query_layer.size(0), + key_layer.size(-2), + key_layer.size(-2) + ]), + ).transpose(-1, -2) + score += p2c_att / torch.tensor(scale, dtype=p2c_att.dtype) + + return score + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm +class DebertaV2Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + pad_token_id = getattr(config, 'pad_token_id', 0) + self.embedding_size = getattr(config, 'embedding_size', + config.hidden_size) + self.word_embeddings = nn.Embedding( + config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + self.position_biased_input = getattr(config, 'position_biased_input', + True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + self.embedding_size) + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear( + self.embedding_size, config.hidden_size, bias=False) + self.LayerNorm = LayerNorm(config.hidden_size, 
config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + mask=None, + inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.config.type_vocab_size > 0: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(embeddings.dtype) + + embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 +class DebertaV2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaV2Config + base_model_prefix = 'deberta' + _keys_to_ignore_on_load_missing = ['position_ids'] + _keys_to_ignore_on_load_unexpected = ['position_embeddings'] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DebertaV2Encoder): + module.gradient_checkpointing = value + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + + Parameters: + config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + 'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.', + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.embeddings = DebertaV2Embeddings(config) + self.encoder = DebertaV2Encoder(config) + self.z_steps = 0 + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError( + 'The prune function is not implemented in DeBERTa model.') + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + encoded_layers = encoder_outputs[1] + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output, ) + encoder_outputs[ + (1 if output_hidden_states else 2):] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states + if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """DeBERTa Model with a `language modeling` head on top.""", + DEBERTA_START_DOCSTRING) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + 
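+    # `pooler.*` weights found in a checkpoint are ignored (this MLM model has no
+    # pooler), while `position_ids` and the prediction decoder bias are created when
+    # the model is constructed, so they need not be present in the checkpoint.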
_keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[1:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are 
the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 +class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, 'num_labels', 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, num_labels) + drop_out = getattr(config, 'cls_dropout', None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
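+
+        Example (illustrative sketch; the path is a placeholder for a local directory
+        containing a fine-tuned classification checkpoint together with its `spm.model`,
+        and imports are omitted since they depend on how the package exposes these
+        classes):
+
+            >>> tokenizer = DebertaV2Tokenizer.from_pretrained('/path/to/checkpoint')
+            >>> model = DebertaV2ForSequenceClassification.from_pretrained('/path/to/checkpoint')
+            >>> inputs = tokenizer('这家餐厅的菜很好吃', return_tensors='pt')
+            >>> logits = model(**inputs).logits
+            >>> predicted_label_id = logits.argmax(-1).item()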
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, + label_index.expand( + label_index.size(0), logits.size(1))) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct( + labeled_logits.view(-1, self.num_labels).float(), + labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) + else: + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 +class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 +class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
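+
+        Example (illustrative sketch; `tokenizer` and `model` are assumed to be a
+        `DebertaV2Tokenizer` and a `DebertaV2ForQuestionAnswering` loaded from a
+        placeholder checkpoint directory):
+
+            >>> inputs = tokenizer('DeBERTa是谁提出的？', 'DeBERTa由微软的研究人员提出。', return_tensors='pt')
+            >>> outputs = model(**inputs)
+            >>> start = outputs.start_logits.argmax(-1).item()
+            >>> end = outputs.end_logits.argmax(-1).item()
+            >>> answer = tokenizer.decode(inputs['input_ids'][0][start:end + 1])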
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + DEBERTA_START_DOCSTRING, +) +class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, 'num_labels', 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, 1) + drop_out = getattr(config, 'cls_dropout', None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + self.init_weights() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward( + DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.deberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[1:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py new file mode 100644 index 00000000..adb60288 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py @@ -0,0 +1,546 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DeBERTa. mainly copied from :module:`~transformers.tokenization_deberta`""" + +import os +import unicodedata +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as sp +from transformers.tokenization_utils import PreTrainedTokenizer + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +PRETRAINED_INIT_CONFIGURATION = {} + +VOCAB_FILES_NAMES = {'vocab_file': 'spm.model'} + + +class DebertaV2Tokenizer(PreTrainedTokenizer): + r""" + Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece) + and [jieba](https://github.com/fxsjy/jieba). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. 
+ do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`string`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`string`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
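+
+    Example (illustrative sketch; the path is a placeholder for a real SentencePiece
+    model file):
+
+        >>> tokenizer = DebertaV2Tokenizer(vocab_file='/path/to/spm.model')
+        >>> tokens = tokenizer.tokenize('今天天气不错')  # jieba word segmentation, then SentencePiece pieces
+        >>> ids = tokenizer.encode('今天天气不错')  # wraps the token ids in [CLS] ... [SEP]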
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=False, + split_by_punct=False, + split_chinese=True, + bos_token='[CLS]', + eos_token='[SEP]', + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + do_lower_case=do_lower_case, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + split_by_punct=split_by_punct, + split_chinese=split_chinese, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + ' model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`' + ) + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.split_chinese = split_chinese + self.vocab_file = vocab_file + self._tokenizer = SPMTokenizer( + vocab_file, + split_by_punct=split_by_punct, + sp_model_kwargs=self.sp_model_kwargs) + self.jieba = None + if self.split_chinese: + try: + import jieba + except ImportError: + raise ImportError( + 'You need to install jieba to split chinese and use DebertaV2Tokenizer. ' + 'See https://pypi.org/project/jieba/ for installation.') + self.jieba = jieba + + @property + def vocab_size(self): + return len(self.vocab) + + @property + def vocab(self): + return self._tokenizer.vocab + + def get_vocab(self): + vocab = self.vocab.copy() + vocab.update(self.get_added_vocab()) + return vocab + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + if self.do_lower_case: + text = text.lower() + if self.split_chinese: + seg_list = [x for x in self.jieba.cut(text)] + text = ' '.join(seg_list) + return self._tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self._tokenizer.spm.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self._tokenizer.spm.IdToPiece( + index) if index < self.vocab_size else self.unk_token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + return self._tokenizer.decode(tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
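+
+        Example (schematic; `cls` and `sep` stand for the actual special token ids):
+
+            >>> tokenizer.build_inputs_with_special_tokens([10, 11])        # [cls, 10, 11, sep]
+            >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12])  # [cls, 10, 11, sep, 12, sep]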
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def prepare_for_tokenization(self, + text, + is_split_into_words=False, + **kwargs): + add_prefix_space = kwargs.pop('add_prefix_space', False) + if is_split_into_words or add_prefix_space: + text = ' ' + text + return (text, kwargs) + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + return self._tokenizer.save_pretrained( + save_directory, filename_prefix=filename_prefix) + + +class SPMTokenizer: + r""" + Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. 
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + def __init__(self, + vocab_file, + split_by_punct=False, + sp_model_kwargs: Optional[Dict[str, Any]] = None): + self.split_by_punct = split_by_punct + self.vocab_file = vocab_file + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + if not os.path.exists(vocab_file): + raise FileNotFoundError(f'{vocab_file} does not exist!') + spm.load(vocab_file) + bpe_vocab_size = spm.GetPieceSize() + # Token map + # 0+1 + # 1+1 + # 2+1 + self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)} + self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] + # self.vocab['[PAD]'] = 0 + # self.vocab['[CLS]'] = 1 + # self.vocab['[SEP]'] = 2 + # self.vocab['[UNK]'] = 3 + + self.spm = spm + + def __getstate__(self): + state = self.__dict__.copy() + state['spm'] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, 'sp_model_kwargs'): + self.sp_model_kwargs = {} + + self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + self.spm.Load(self.vocab_file) + + def tokenize(self, text): + return self._encode_as_pieces(text) + + def convert_ids_to_tokens(self, ids): + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def decode(self, tokens, start=-1, end=-1, raw_text=None): + if raw_text is None: + return self.spm.decode_pieces([t for t in tokens]) + else: + words = self.split_to_words(raw_text) + word_tokens = [self.tokenize(w) for w in words] + token2words = [0] * len(tokens) + tid = 0 + for i, w in enumerate(word_tokens): + for k, t in enumerate(w): + token2words[tid] = i + tid += 1 + word_start = token2words[start] + word_end = token2words[end] if end < len(tokens) else len(words) + text = ''.join(words[word_start:word_end]) + return text + + def add_special_token(self, token): + if token not in self.special_tokens: + self.special_tokens.append(token) + if token not in self.vocab: + self.vocab[token] = len(self.vocab) - 1 + self.ids_to_tokens.append(token) + return self.id(token) + + def part_of_whole_word(self, token, is_bos=False): + if is_bos: + return True + if (len(token) == 1 and (_is_whitespace(list(token)[0]))): + return False + if _is_control(list(token)[0]): + return False + if _is_punctuation(list(token)[0]): + return False + if token in self.add_special_token: + return False + + word_start = b'\xe2\x96\x81'.decode('utf-8') + return not token.startswith(word_start) + + def pad(self): + return '[PAD]' + + def bos(self): + return '[CLS]' + + def eos(self): + return '[SEP]' + + def unk(self): + return '[UNK]' + + def mask(self): + return '[MASK]' + + def sym(self, id): + return self.ids_to_tokens[id] + + def id(self, sym): + return self.vocab[sym] if sym in self.vocab else 1 + + def _encode_as_pieces(self, text): + text = convert_to_unicode(text) + if self.split_by_punct: + words = self._run_split_on_punc(text) + pieces = [self.spm.encode(w, out_type=str) for w in words] + return [p for w in pieces for p in w] + else: + return self.spm.encode(text, out_type=str) + + def split_to_words(self, text): + pieces = self._encode_as_pieces(text) + word_start = b'\xe2\x96\x81'.decode('utf-8') + words = [] + 
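+        # Re-assemble surface words from the SentencePiece pieces: a piece starting with
+        # the word marker '▁' (U+2581) opens a new word, while offset/prev_end track the
+        # matching character positions in the original text.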
offset = 0 + prev_end = 0 + for i, p in enumerate(pieces): + if p.startswith(word_start): + if offset > prev_end: + words.append(text[prev_end:offset]) + prev_end = offset + w = p.replace(word_start, '') + else: + w = p + try: + s = text.index(w, offset) + pn = '' + k = i + 1 + while k < len(pieces): + pn = pieces[k].replace(word_start, '') + if len(pn) > 0: + break + k += 1 + + if len(pn) > 0 and pn in text[offset:s]: + offset = offset + 1 + else: + offset = s + len(w) + except Exception: + offset = offset + 1 + + if prev_end < offset: + words.append(text[prev_end:offset]) + + return words + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def save_pretrained(self, path: str, filename_prefix: str = None): + filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] + if filename_prefix is not None: + filename = filename_prefix + '-' + filename + full_path = os.path.join(path, filename) + with open(full_path, 'wb') as fs: + fs.write(self.spm.serialized_model_proto()) + return (full_path, ) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat.startswith('C'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
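+    # These four ASCII ranges are '!'..'/', ':'..'@', '['..'`' and '{'..'~'.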
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( + cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode('utf-8', 'ignore') + else: + raise ValueError(f'Unsupported string type: {type(text)}') diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py new file mode 100644 index 00000000..a1fcecf4 --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py @@ -0,0 +1,241 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization class for model DeBERTa.""" + +import os +from shutil import copyfile +from typing import Optional, Tuple + +from transformers.file_utils import is_sentencepiece_available +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + +from modelscope.utils import logger as logging + +if is_sentencepiece_available(): + from .tokenization_deberta_v2 import DebertaV2Tokenizer +else: + DebertaV2Tokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'spm.model', + 'tokenizer_file': 'tokenizer.json' +} + +PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + +PRETRAINED_INIT_CONFIGURATION = {} + + +class DebertaV2TokenizerFast(PreTrainedTokenizerFast): + r""" + Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece) + and [rjieba-py](https://github.com/messense/rjieba-py). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`string`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`string`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = DebertaV2Tokenizer + + def __init__(self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=False, + split_by_punct=False, + split_chinese=True, + bos_token='[CLS]', + eos_token='[SEP]', + unk_token='[UNK]', + sep_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + mask_token='[MASK]', + **kwargs) -> None: + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + split_by_punct=split_by_punct, + split_chinese=split_chinese, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + self.split_chinese = split_chinese + self.vocab_file = vocab_file + self.can_save_slow_tokenizer = False if not self.vocab_file else True + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
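+
+        Example (illustrative ids, assuming `cls_token_id=1` and `sep_token_id=2`):
+
+            tokenizer.build_inputs_with_special_tokens([5, 6])        # -> [1, 5, 6, 2]
+            tokenizer.build_inputs_with_special_tokens([5, 6], [7])   # -> [1, 5, 6, 2, 7, 2]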
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
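+
+        Example (illustrative): for `token_ids_0=[5, 6]` and `token_ids_1=[7]` the returned mask is
+        `[0, 0, 0, 0, 1, 1]`: zeros cover `[CLS] A [SEP]`, ones cover `B [SEP]`.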
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + + sep) * [1] + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' + 'tokenizer.') + + if not os.path.isdir(save_directory): + logger.error( + f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + + VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file, ) diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py index 17324be9..4f466c23 100644 --- a/modelscope/models/nlp/masked_language.py +++ b/modelscope/models/nlp/masked_language.py @@ -6,6 +6,8 @@ from transformers import BertForMaskedLM as BertForMaskedLMTransformer from modelscope.metainfo import Models from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS +from modelscope.models.nlp.deberta_v2 import \ + DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer from modelscope.models.nlp.structbert import SbertForMaskedLM from modelscope.models.nlp.veco import \ VecoForMaskedLM as VecoForMaskedLMTransformer @@ -125,3 +127,40 @@ class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): VecoForMaskedLM).from_pretrained( pretrained_model_name_or_path=model_dir, model_dir=model_dir) + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) +class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer): + """Deberta v2 for MLM model. + + Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets. 
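+
+    Registered for `Tasks.fill_mask` under `Models.deberta_v2`, so it is built by
+    `Model.from_pretrained(model_dir)` via the `_instantiate` classmethod below, which loads the
+    checkpoint from the local model directory.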
+ """ + + def __init__(self, config, model_dir): + super(TorchModel, self).__init__(model_dir) + DebertaV2ForMaskedLMTransformer.__init__(self, config) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None): + output = DebertaV2ForMaskedLMTransformer.forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + labels=labels) + output[OutputKeys.INPUT_IDS] = input_ids + return output + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.get('model_dir') + return super(DebertaV2ForMaskedLMTransformer, + DebertaV2ForMaskedLM).from_pretrained( + pretrained_model_name_or_path=model_dir, + model_dir=model_dir) diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 60a9631b..caba4122 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -13,7 +13,10 @@ from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks __all__ = ['FillMaskPipeline'] -_type_map = {'veco': 'roberta', 'sbert': 'bert'} +_type_map = { + 'veco': 'roberta', + 'sbert': 'bert', +} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) @@ -65,7 +68,7 @@ class FillMaskPipeline(Pipeline): self.config = Config.from_file( os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103} + self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4} self.rep_map = { 'bert': { @@ -85,7 +88,14 @@ class FillMaskPipeline(Pipeline): '': '', '': '', '': ' ' - } + }, + 'deberta_v2': { + '[PAD]': '', + r' +': ' ', + '[SEP]': '', + '[CLS]': '', + '[UNK]': '' + }, } def forward(self, inputs: Dict[str, Any], diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 4882c477..825611d6 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -170,6 +170,9 @@ class NLPTokenizerPreprocessorBase(Preprocessor): elif model_type == Models.veco: from modelscope.models.nlp.veco import VecoTokenizer return VecoTokenizer.from_pretrained(model_dir) + elif model_type == Models.deberta_v2: + from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer + return DebertaV2Tokenizer.from_pretrained(model_dir) else: return AutoTokenizer.from_pretrained(model_dir, use_fast=False) diff --git a/tests/pipelines/test_deberta_tasks.py b/tests/pipelines/test_deberta_tasks.py new file mode 100644 index 00000000..4f3206cd --- /dev/null +++ b/tests/pipelines/test_deberta_tasks.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
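+# Pipeline-level tests for the Chinese DeBERTa-v2 lite fill-mask model: the same masked
+# sentence is run through a directly constructed FillMaskPipeline, a pipeline() built from a
+# downloaded model, and a pipeline() resolved from the bare model id.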
+import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import DebertaV2ForMaskedLM +from modelscope.models.nlp.deberta_v2 import (DebertaV2Tokenizer, + DebertaV2TokenizerFast) +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FillMaskPipeline +from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class DeBERTaV2TaskTest(unittest.TestCase): + model_id_deberta = 'damo/nlp_debertav2_fill-mask_chinese-lite' + + ori_text = '你师父差得动你,你师父可差不动我。' + test_input = '你师父差得动你,你师父可[MASK]不动我。' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id_deberta) + preprocessor = FillMaskPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = DebertaV2ForMaskedLM.from_pretrained(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_text + test_input = self.test_input + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + # sbert + print(self.model_id_deberta) + model = Model.from_pretrained(self.model_id_deberta) + preprocessor = FillMaskPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=model, preprocessor=preprocessor) + print( + f'\nori_text: {self.ori_text}\ninput: {self.test_input}\npipeline: ' + f'{pipeline_ins(self.test_input)}\n') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=self.model_id_deberta) + ori_text = self.ori_text + test_input = self.test_input + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + + +if __name__ == '__main__': + unittest.main() From 9e14d6727b7583fed29f0684a1171754a505388d Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Fri, 2 Sep 2022 11:02:43 +0800 Subject: [PATCH 05/28] [to #44571845]fix: ci support multiple image Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9974293 --- .dev_scripts/ci_container_test.sh | 3 - .dev_scripts/dockerci.sh | 5 +- requirements/tensorflow1x.txt | 1 + tests/isolated_cases.txt | 6 - tests/run.py | 191 ++++++++++++++++++++---------- tests/run_config.yaml | 31 +++++ 6 files changed, 165 insertions(+), 72 deletions(-) create mode 100644 requirements/tensorflow1x.txt delete mode 100644 tests/isolated_cases.txt create mode 100644 tests/run_config.yaml diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 2f18aff7..a53c08c6 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -4,8 +4,6 @@ pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r 
requirements/tests.txt -# install numpy<=1.18 for tensorflow==1.15.x -pip install "numpy<=1.18" git config --global --add safe.directory /Maas-lib @@ -26,4 +24,3 @@ else fi echo "Running case with command: $ci_command" $ci_command -#python tests/run.py --isolated_cases test_text_to_speech.py test_multi_modal_embedding.py test_ofa_tasks.py test_video_summarization.py diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index dbb79514..e76f2f14 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -7,7 +7,8 @@ gpus='7 6 5 4 3 2 1 0' cpu_sets='0-7 8-15 16-23 24-30 31-37 38-44 45-51 52-58' cpu_sets_arr=($cpu_sets) is_get_file_lock=false -CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_COMMAND} +# export RUN_CASE_COMMAND='python tests/run.py --run_config tests/run_config.yaml' +CI_COMMAND=${CI_COMMAND:-bash .dev_scripts/ci_container_test.sh $RUN_CASE_BASE_COMMAND} echo "ci command: $CI_COMMAND" for gpu in $gpus do @@ -16,6 +17,7 @@ do echo "get gpu lock $gpu" CONTAINER_NAME="modelscope-ci-$gpu" let is_get_file_lock=true + # pull image if there are update docker pull ${IMAGE_NAME}:${IMAGE_VERSION} docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ @@ -38,6 +40,7 @@ do --net host \ ${IMAGE_NAME}:${IMAGE_VERSION} \ $CI_COMMAND + if [ $? -ne 0 ]; then echo "Running test case failed, please check the log!" exit -1 diff --git a/requirements/tensorflow1x.txt b/requirements/tensorflow1x.txt new file mode 100644 index 00000000..b139efe1 --- /dev/null +++ b/requirements/tensorflow1x.txt @@ -0,0 +1 @@ +numpy==1.18.5 diff --git a/tests/isolated_cases.txt b/tests/isolated_cases.txt deleted file mode 100644 index be85142a..00000000 --- a/tests/isolated_cases.txt +++ /dev/null @@ -1,6 +0,0 @@ - test_text_to_speech.py - test_multi_modal_embedding.py - test_ofa_tasks.py - test_video_summarization.py - test_dialog_modeling.py - test_csanmt_translation.py diff --git a/tests/run.py b/tests/run.py index 79509745..478cb9d6 100644 --- a/tests/run.py +++ b/tests/run.py @@ -21,6 +21,7 @@ import pandas # if 'import tensorflow' in front of 'import torch'. # Puting a 'import torch' here can bypass this incompatibility. 
import torch +import yaml from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import set_test_level, test_level @@ -61,6 +62,7 @@ def statistics_test_result(df): result, total_cases, success_cases, failures_cases, error_cases, skipped_cases, expected_failure_cases, unexpected_success_cases) + print('Testing result summary.') print(result_msg) if result == 'FAILED': sys.exit(1) @@ -88,6 +90,7 @@ def gather_test_suites_files(test_dir, pattern): for file in filenames: if fnmatch(file, pattern): case_file_list.append(file) + return case_file_list @@ -125,18 +128,6 @@ def collect_test_results(case_results): return result_list -class TestSuiteRunner: - - def run(self, msg_queue, test_dir, test_suite_file): - test_suite = unittest.TestSuite() - test_case = unittest.defaultTestLoader.discover( - start_dir=test_dir, pattern=test_suite_file) - test_suite.addTest(test_case) - runner = TimeCostTextTestRunner() - test_suite_result = runner.run(test_suite) - msg_queue.put(collect_test_results(test_suite_result)) - - def run_command_with_popen(cmd): with subprocess.Popen( cmd, @@ -148,55 +139,126 @@ def run_command_with_popen(cmd): sys.stdout.write(line) +def save_test_result(df, args): + if args.result_dir is not None: + file_name = str(int(datetime.datetime.now().timestamp() * 1000)) + os.umask(0) + Path(args.result_dir).mkdir(mode=0o777, parents=True, exist_ok=True) + Path(os.path.join(args.result_dir, file_name)).touch( + mode=0o666, exist_ok=True) + df.to_pickle(os.path.join(args.result_dir, file_name)) + + +def run_command(cmd): + logger.info('Running command: %s' % ' '.join(cmd)) + response = subprocess.run( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + response.check_returncode() + logger.info(response.stdout.decode('utf8')) + except subprocess.CalledProcessError as error: + logger.error( + 'stdout: %s, stderr: %s' % + (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + + +def install_packages(pkgs): + cmd = [sys.executable, '-m', 'pip', 'install'] + for pkg in pkgs: + cmd.append(pkg) + + run_command(cmd) + + +def install_requirements(requirements): + for req in requirements: + cmd = [ + sys.executable, '-m', 'pip', 'install', '-r', + 'requirements/%s' % req, '-f', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html' + ] + run_command(cmd) + + +def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases, + result_dir): + # install requirements and deps # run_config['envs'][env] + if 'requirements' in env: + install_requirements(env['requirements']) + if 'dependencies' in env: + install_packages(env['dependencies']) + + for test_suite_file in isolated_cases: # run case in subprocess + if test_suite_file in test_suite_env_map and test_suite_env_map[ + test_suite_file] == env_name: + cmd = [ + 'python', + 'tests/run.py', + '--pattern', + test_suite_file, + '--result_dir', + result_dir, + ] + run_command_with_popen(cmd) + else: + pass # case not in run list. + + # run remain cases in a process. 
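+    # The non-isolated suites mapped to this env are batched into one child process below,
+    # e.g. `python tests/run.py --result_dir <tmp_dir> --suites <suite_1.py> <suite_2.py>`.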
+ remain_suite_files = [] + for k, v in test_suite_env_map.items(): + if k not in isolated_cases and v == env_name: + remain_suite_files.append(k) + if len(remain_suite_files) == 0: + return + cmd = ['python', 'tests/run.py', '--result_dir', result_dir, '--suites'] + for suite in remain_suite_files: + cmd.append(suite) + run_command_with_popen(cmd) + + def run_in_subprocess(args): # only case args.isolated_cases run in subporcess, all other run in a subprocess test_suite_files = gather_test_suites_files( os.path.abspath(args.test_dir), args.pattern) + run_config = None + isolated_cases = [] + test_suite_env_map = {} + # put all the case in default env. + for test_suite_file in test_suite_files: + test_suite_env_map[test_suite_file] = 'default' + + if args.run_config is not None and Path(args.run_config).exists(): + with open(args.run_config) as f: + run_config = yaml.load(f, Loader=yaml.FullLoader) + if 'isolated' in run_config: + isolated_cases = run_config['isolated'] + + if 'envs' in run_config: + for env in run_config['envs']: + if env != 'default': + for test_suite in run_config['envs'][env]['tests']: + if test_suite in test_suite_env_map: + test_suite_env_map[test_suite] = env if args.subprocess: # run all case in subprocess isolated_cases = test_suite_files - else: - isolated_cases = [] - with open(args.isolated_cases, 'r') as f: - for line in f: - if line.strip() in test_suite_files: - isolated_cases.append(line.strip()) - - if not args.list_tests: - with tempfile.TemporaryDirectory() as temp_result_dir: - for test_suite_file in isolated_cases: # run case in subprocess - cmd = [ - 'python', 'tests/run.py', '--pattern', test_suite_file, - '--result_dir', temp_result_dir - ] - run_command_with_popen(cmd) - result_dfs = [] - # run remain cases in a process. - remain_suite_files = [ - item for item in test_suite_files if item not in isolated_cases - ] - test_suite = gather_test_suites_in_files(args.test_dir, - remain_suite_files, - args.list_tests) - if test_suite.countTestCases() > 0: - runner = TimeCostTextTestRunner() - result = runner.run(test_suite) - result = collect_test_results(result) - df = test_cases_result_to_df(result) - result_dfs.append(df) - # collect test results - result_path = Path(temp_result_dir) - for result in result_path.iterdir(): - if Path.is_file(result): - df = pandas.read_pickle(result) - result_dfs.append(df) + with tempfile.TemporaryDirectory() as temp_result_dir: + for env in set(test_suite_env_map.values()): + run_case_in_env(env, run_config['envs'][env], test_suite_env_map, + isolated_cases, temp_result_dir) - result_pd = pandas.concat( - result_dfs) # merge result of every test suite. - print_table_result(result_pd) - print_abnormal_case_info(result_pd) - statistics_test_result(result_pd) + result_dfs = [] + result_path = Path(temp_result_dir) + for result in result_path.iterdir(): + if Path.is_file(result): + df = pandas.read_pickle(result) + result_dfs.append(df) + result_pd = pandas.concat( + result_dfs) # merge result of every test suite. 
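+    # Print the merged per-case table and abnormal-case details, then the overall summary,
+    # which exits non-zero when any case failed.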
+ print_table_result(result_pd) + print_abnormal_case_info(result_pd) + statistics_test_result(result_pd) def get_object_full_name(obj): @@ -293,15 +355,19 @@ def print_table_result(df): def main(args): runner = TimeCostTextTestRunner() - test_suite = gather_test_cases( - os.path.abspath(args.test_dir), args.pattern, args.list_tests) + if args.suites is not None and len(args.suites) > 0: + logger.info('Running: %s' % ' '.join(args.suites)) + test_suite = gather_test_suites_in_files(args.test_dir, args.suites, + args.list_tests) + else: + test_suite = gather_test_cases( + os.path.abspath(args.test_dir), args.pattern, args.list_tests) if not args.list_tests: result = runner.run(test_suite) result = collect_test_results(result) df = test_cases_result_to_df(result) if args.result_dir is not None: - file_name = str(int(datetime.datetime.now().timestamp() * 1000)) - df.to_pickle(os.path.join(args.result_dir, file_name)) + save_test_result(df, args) else: print_table_result(df) print_abnormal_case_info(df) @@ -321,9 +387,9 @@ if __name__ == '__main__': parser.add_argument( '--disable_profile', action='store_true', help='disable profiling') parser.add_argument( - '--isolated_cases', + '--run_config', default=None, - help='specified isolated cases config file') + help='specified case run config file(yaml file)') parser.add_argument( '--subprocess', action='store_true', @@ -332,6 +398,10 @@ if __name__ == '__main__': '--result_dir', default=None, help='Save result to directory, internal use only') + parser.add_argument( + '--suites', + nargs='*', + help='Run specified test suites(test suite file list)') args = parser.parse_args() set_test_level(args.level) os.environ['REGRESSION_BASELINE'] = '1' @@ -340,10 +410,7 @@ if __name__ == '__main__': from utils import profiler logger.info('enable profile ...') profiler.enable() - if args.isolated_cases is not None or args.subprocess: + if args.run_config is not None or args.subprocess: run_in_subprocess(args) - elif args.isolated_cases is not None and args.subprocess: - print('isolated_cases and subporcess conflict') - sys.exit(1) else: main(args) diff --git a/tests/run_config.yaml b/tests/run_config.yaml new file mode 100644 index 00000000..591dcd66 --- /dev/null +++ b/tests/run_config.yaml @@ -0,0 +1,31 @@ +# envs option allows fine-grained control for test executoin, for example, +# python tests/run.py --env pytorch +# would only trigger exeutions of all pytorch cases. +# envs option defaults to None for backward compatbility +isolated: # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process. + - test_text_to_speech.py + - test_multi_modal_embedding.py + - test_ofa_tasks.py + - test_video_summarization.py + - test_dialog_modeling.py + - test_csanmt_translation.py + +envs: + default: # default env, case not in other env will in default, pytorch. + dependencies: # requirement packages,pip install before test case run. + - numpy>=1.20 + tensorflow1x: # cases excuted tensorflow1.x framework. + requirements: # requirements files run before test case run. + - tensorflow1x.txt + dependencies: # requirement packages,pip install before test case run. 
+ - numpy==1.18.5 + tests: + - test_text_to_speech.py + - test_csanmt_translation.py + - test_translation_trainer.py + - test_ocr_detection.py + - test_automatic_speech_recognition.py + - test_image_matting.py + - test_person_image_cartoon.py + - test_skin_retouching.py + - test_image_style_transfer.py From 1bac4f3349cbd1c343f4fbe1d9ec80198afd1a32 Mon Sep 17 00:00:00 2001 From: "xianzhe.xxz" Date: Fri, 2 Sep 2022 13:10:31 +0800 Subject: [PATCH 06/28] [to #42322933]add tinynas-detection pipeline and models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 接入tinynas-detection,新增tinynas object detection pipeline以及tinynas models。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9938220 --- modelscope/metainfo.py | 3 + .../models/cv/tinynas_detection/__init__.py | 24 + .../cv/tinynas_detection/backbone/__init__.py | 16 + .../cv/tinynas_detection/backbone/darknet.py | 126 ++++ .../cv/tinynas_detection/backbone/tinynas.py | 347 +++++++++ .../cv/tinynas_detection/core/__init__.py | 2 + .../cv/tinynas_detection/core/base_ops.py | 474 +++++++++++++ .../cv/tinynas_detection/core/neck_ops.py | 324 +++++++++ .../cv/tinynas_detection/core/repvgg_block.py | 205 ++++++ .../models/cv/tinynas_detection/core/utils.py | 196 ++++++ .../models/cv/tinynas_detection/detector.py | 181 +++++ .../cv/tinynas_detection/head/__init__.py | 16 + .../tinynas_detection/head/gfocal_v2_tiny.py | 361 ++++++++++ .../cv/tinynas_detection/neck/__init__.py | 16 + .../tinynas_detection/neck/giraffe_config.py | 235 +++++++ .../cv/tinynas_detection/neck/giraffe_fpn.py | 661 ++++++++++++++++++ .../tinynas_detection/neck/giraffe_fpn_v2.py | 203 ++++++ .../cv/tinynas_detection/tinynas_detector.py | 16 + .../models/cv/tinynas_detection/utils.py | 30 + .../cv/tinynas_detection_pipeline.py | 61 ++ tests/pipelines/test_tinynas_detection.py | 20 + 21 files changed, 3517 insertions(+) create mode 100644 modelscope/models/cv/tinynas_detection/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/backbone/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/backbone/darknet.py create mode 100755 modelscope/models/cv/tinynas_detection/backbone/tinynas.py create mode 100644 modelscope/models/cv/tinynas_detection/core/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/core/base_ops.py create mode 100644 modelscope/models/cv/tinynas_detection/core/neck_ops.py create mode 100644 modelscope/models/cv/tinynas_detection/core/repvgg_block.py create mode 100644 modelscope/models/cv/tinynas_detection/core/utils.py create mode 100644 modelscope/models/cv/tinynas_detection/detector.py create mode 100644 modelscope/models/cv/tinynas_detection/head/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/__init__.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_config.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py create mode 100644 modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py create mode 100644 modelscope/models/cv/tinynas_detection/tinynas_detector.py create mode 100644 modelscope/models/cv/tinynas_detection/utils.py create mode 100644 modelscope/pipelines/cv/tinynas_detection_pipeline.py create mode 100644 tests/pipelines/test_tinynas_detection.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 971dd3f1..fd653bac 100644 --- a/modelscope/metainfo.py +++ 
b/modelscope/metainfo.py @@ -9,6 +9,8 @@ class Models(object): Model name should only contain model info but not task info. """ + tinynas_detection = 'tinynas-detection' + # vision models detection = 'detection' realtime_object_detection = 'realtime-object-detection' @@ -133,6 +135,7 @@ class Pipelines(object): image_to_image_generation = 'image-to-image-generation' skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' + tinynas_detection = 'tinynas-detection' crowd_counting = 'hrnet-crowd-counting' action_detection = 'ResNetC3D-action-detection' video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' diff --git a/modelscope/models/cv/tinynas_detection/__init__.py b/modelscope/models/cv/tinynas_detection/__init__.py new file mode 100644 index 00000000..13532d10 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .tinynas_detector import Tinynas_detector + +else: + _import_structure = { + 'tinynas_detector': ['TinynasDetector'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/tinynas_detection/backbone/__init__.py b/modelscope/models/cv/tinynas_detection/backbone/__init__.py new file mode 100644 index 00000000..186d06a3 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import copy + +from .darknet import CSPDarknet +from .tinynas import load_tinynas_net + + +def build_backbone(cfg): + backbone_cfg = copy.deepcopy(cfg) + name = backbone_cfg.pop('name') + if name == 'CSPDarknet': + return CSPDarknet(**backbone_cfg) + elif name == 'TinyNAS': + return load_tinynas_net(backbone_cfg) diff --git a/modelscope/models/cv/tinynas_detection/backbone/darknet.py b/modelscope/models/cv/tinynas_detection/backbone/darknet.py new file mode 100644 index 00000000..d3294f0d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/darknet.py @@ -0,0 +1,126 @@ +# Copyright (c) Megvii Inc. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
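+#
+# CSPDarknet backbone (YOLOX-style): a Focus stem followed by the dark2-dark5 stages built
+# from CSPLayer blocks, with an SPPBottleneck inside dark5. forward() returns the stem and
+# the four stage outputs as a multi-scale feature list.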
+ +import torch +from torch import nn + +from ..core.base_ops import (BaseConv, CSPLayer, DWConv, Focus, ResLayer, + SPPBottleneck) + + +class CSPDarknet(nn.Module): + + def __init__( + self, + dep_mul, + wid_mul, + out_features=('dark3', 'dark4', 'dark5'), + depthwise=False, + act='silu', + reparam=False, + ): + super(CSPDarknet, self).__init__() + assert out_features, 'please provide output features of Darknet' + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + + # stem + # self.stem = Focus(3, base_channels, ksize=3, act=act) + self.stem = Focus(3, base_channels, 3, act=act) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2, act=act), + CSPLayer( + base_channels * 2, + base_channels * 2, + n=base_depth, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2, act=act), + CSPLayer( + base_channels * 4, + base_channels * 4, + n=base_depth * 3, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2, act=act), + CSPLayer( + base_channels * 8, + base_channels * 8, + n=base_depth * 3, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2, act=act), + SPPBottleneck( + base_channels * 16, base_channels * 16, activation=act), + CSPLayer( + base_channels * 16, + base_channels * 16, + n=base_depth, + shortcut=False, + depthwise=depthwise, + act=act, + reparam=reparam, + ), + ) + + def init_weights(self, pretrain=None): + + if pretrain is None: + return + else: + pretrained_dict = torch.load( + pretrain, map_location='cpu')['state_dict'] + new_params = self.state_dict().copy() + for k, v in pretrained_dict.items(): + ks = k.split('.') + if ks[0] == 'fc' or ks[-1] == 'total_ops' or ks[ + -1] == 'total_params': + continue + else: + new_params[k] = v + + self.load_state_dict(new_params) + print(f' load pretrain backbone from {pretrain}') + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs['stem'] = x + x = self.dark2(x) + outputs['dark2'] = x + x = self.dark3(x) + outputs['dark3'] = x + x = self.dark4(x) + outputs['dark4'] = x + x = self.dark5(x) + outputs['dark5'] = x + features_out = [ + outputs['stem'], outputs['dark2'], outputs['dark3'], + outputs['dark4'], outputs['dark5'] + ] + + return features_out diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py new file mode 100755 index 00000000..814ee550 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py @@ -0,0 +1,347 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
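+#
+# TinyNAS backbone: load_tinynas_net() parses a searched structure string (a list of block
+# dicts with 'class', 'in', 'out', 'btn', 'k', 's' and 'L' fields) via ast.literal_eval and
+# assembles ConvKXBNRELU / SuperResConvK1KX / SuperResConvKXKX stages built on RepVgg blocks;
+# forward() returns the features selected by out_indices, optionally projected by 1x1 convs.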
+ +import torch +import torch.nn as nn + +from ..core.base_ops import Focus, SPPBottleneck, get_activation +from ..core.repvgg_block import RepVggBlock + + +class ConvKXBN(nn.Module): + + def __init__(self, in_c, out_c, kernel_size, stride): + super(ConvKXBN, self).__init__() + self.conv1 = nn.Conv2d( + in_c, + out_c, + kernel_size, + stride, (kernel_size - 1) // 2, + groups=1, + bias=False) + self.bn1 = nn.BatchNorm2d(out_c) + + def forward(self, x): + return self.bn1(self.conv1(x)) + + +class ConvKXBNRELU(nn.Module): + + def __init__(self, in_c, out_c, kernel_size, stride, act='silu'): + super(ConvKXBNRELU, self).__init__() + self.conv = ConvKXBN(in_c, out_c, kernel_size, stride) + if act is None: + self.activation_function = torch.relu + else: + self.activation_function = get_activation(act) + + def forward(self, x): + output = self.conv(x) + return self.activation_function(output) + + +class ResConvK1KX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + force_resproj=False, + act='silu'): + super(ResConvK1KX, self).__init__() + self.stride = stride + self.conv1 = ConvKXBN(in_c, btn_c, 1, 1) + self.conv2 = RepVggBlock( + btn_c, out_c, kernel_size, stride, act='identity') + + if act is None: + self.activation_function = torch.relu + else: + self.activation_function = get_activation(act) + + if stride == 2: + self.residual_downsample = nn.AvgPool2d(kernel_size=2, stride=2) + else: + self.residual_downsample = nn.Identity() + + if in_c != out_c or force_resproj: + self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) + else: + self.residual_proj = nn.Identity() + + def forward(self, x): + if self.stride != 2: + reslink = self.residual_downsample(x) + reslink = self.residual_proj(reslink) + + output = x + output = self.conv1(output) + output = self.activation_function(output) + output = self.conv2(output) + if self.stride != 2: + output = output + reslink + output = self.activation_function(output) + + return output + + +class SuperResConvK1KX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + num_blocks, + with_spp=False, + act='silu'): + super(SuperResConvK1KX, self).__init__() + if act is None: + self.act = torch.relu + else: + self.act = get_activation(act) + self.block_list = nn.ModuleList() + for block_id in range(num_blocks): + if block_id == 0: + in_channels = in_c + out_channels = out_c + this_stride = stride + force_resproj = False # as a part of CSPLayer, DO NOT need this flag + this_kernel_size = kernel_size + else: + in_channels = out_c + out_channels = out_c + this_stride = 1 + force_resproj = False + this_kernel_size = kernel_size + the_block = ResConvK1KX( + in_channels, + out_channels, + btn_c, + this_kernel_size, + this_stride, + force_resproj, + act=act) + self.block_list.append(the_block) + if block_id == 0 and with_spp: + self.block_list.append( + SPPBottleneck(out_channels, out_channels)) + + def forward(self, x): + output = x + for block in self.block_list: + output = block(output) + return output + + +class ResConvKXKX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + force_resproj=False, + act='silu'): + super(ResConvKXKX, self).__init__() + self.stride = stride + if self.stride == 2: + self.downsampler = ConvKXBNRELU(in_c, out_c, 3, 2, act=act) + else: + self.conv1 = ConvKXBN(in_c, btn_c, kernel_size, 1) + self.conv2 = RepVggBlock( + btn_c, out_c, kernel_size, stride, act='identity') + + if act is None: + self.activation_function = torch.relu + else: + 
self.activation_function = get_activation(act) + + if stride == 2: + self.residual_downsample = nn.AvgPool2d( + kernel_size=2, stride=2) + else: + self.residual_downsample = nn.Identity() + + if in_c != out_c or force_resproj: + self.residual_proj = ConvKXBN(in_c, out_c, 1, 1) + else: + self.residual_proj = nn.Identity() + + def forward(self, x): + if self.stride == 2: + return self.downsampler(x) + reslink = self.residual_downsample(x) + reslink = self.residual_proj(reslink) + + output = x + output = self.conv1(output) + output = self.activation_function(output) + output = self.conv2(output) + + output = output + reslink + output = self.activation_function(output) + + return output + + +class SuperResConvKXKX(nn.Module): + + def __init__(self, + in_c, + out_c, + btn_c, + kernel_size, + stride, + num_blocks, + with_spp=False, + act='silu'): + super(SuperResConvKXKX, self).__init__() + if act is None: + self.act = torch.relu + else: + self.act = get_activation(act) + self.block_list = nn.ModuleList() + for block_id in range(num_blocks): + if block_id == 0: + in_channels = in_c + out_channels = out_c + this_stride = stride + force_resproj = False # as a part of CSPLayer, DO NOT need this flag + this_kernel_size = kernel_size + else: + in_channels = out_c + out_channels = out_c + this_stride = 1 + force_resproj = False + this_kernel_size = kernel_size + the_block = ResConvKXKX( + in_channels, + out_channels, + btn_c, + this_kernel_size, + this_stride, + force_resproj, + act=act) + self.block_list.append(the_block) + if block_id == 0 and with_spp: + self.block_list.append( + SPPBottleneck(out_channels, out_channels)) + + def forward(self, x): + output = x + for block in self.block_list: + output = block(output) + return output + + +class TinyNAS(nn.Module): + + def __init__(self, + structure_info=None, + out_indices=[0, 1, 2, 4, 5], + out_channels=[None, None, 128, 256, 512], + with_spp=False, + use_focus=False, + need_conv1=True, + act='silu'): + super(TinyNAS, self).__init__() + assert len(out_indices) == len(out_channels) + self.out_indices = out_indices + self.need_conv1 = need_conv1 + + self.block_list = nn.ModuleList() + if need_conv1: + self.conv1_list = nn.ModuleList() + for idx, block_info in enumerate(structure_info): + the_block_class = block_info['class'] + if the_block_class == 'ConvKXBNRELU': + if use_focus: + the_block = Focus(block_info['in'], block_info['out'], + block_info['k']) + else: + the_block = ConvKXBNRELU( + block_info['in'], + block_info['out'], + block_info['k'], + block_info['s'], + act=act) + self.block_list.append(the_block) + elif the_block_class == 'SuperResConvK1KX': + spp = with_spp if idx == len(structure_info) - 1 else False + the_block = SuperResConvK1KX( + block_info['in'], + block_info['out'], + block_info['btn'], + block_info['k'], + block_info['s'], + block_info['L'], + spp, + act=act) + self.block_list.append(the_block) + elif the_block_class == 'SuperResConvKXKX': + spp = with_spp if idx == len(structure_info) - 1 else False + the_block = SuperResConvKXKX( + block_info['in'], + block_info['out'], + block_info['btn'], + block_info['k'], + block_info['s'], + block_info['L'], + spp, + act=act) + self.block_list.append(the_block) + if need_conv1: + if idx in self.out_indices and out_channels[ + self.out_indices.index(idx)] is not None: + self.conv1_list.append( + nn.Conv2d(block_info['out'], + out_channels[self.out_indices.index(idx)], + 1)) + else: + self.conv1_list.append(None) + + def init_weights(self, pretrain=None): + pass + + def forward(self, x): 
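+        """Run the stages sequentially and collect the outputs whose index is listed in
+        `out_indices`, applying the optional 1x1 projection convs when they are configured."""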
+ output = x + stage_feature_list = [] + for idx, block in enumerate(self.block_list): + output = block(output) + if idx in self.out_indices: + if self.need_conv1 and self.conv1_list[idx] is not None: + true_out = self.conv1_list[idx](output) + stage_feature_list.append(true_out) + else: + stage_feature_list.append(output) + return stage_feature_list + + +def load_tinynas_net(backbone_cfg): + # load masternet model to path + import ast + + struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str]) + struct_info = ast.literal_eval(struct_str) + for layer in struct_info: + if 'nbitsA' in layer: + del layer['nbitsA'] + if 'nbitsW' in layer: + del layer['nbitsW'] + + model = TinyNAS( + structure_info=struct_info, + out_indices=backbone_cfg.out_indices, + out_channels=backbone_cfg.out_channels, + with_spp=backbone_cfg.with_spp, + use_focus=backbone_cfg.use_focus, + act=backbone_cfg.act, + need_conv1=backbone_cfg.need_conv1, + ) + + return model diff --git a/modelscope/models/cv/tinynas_detection/core/__init__.py b/modelscope/models/cv/tinynas_detection/core/__init__.py new file mode 100644 index 00000000..3dad5e72 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. diff --git a/modelscope/models/cv/tinynas_detection/core/base_ops.py b/modelscope/models/cv/tinynas_detection/core/base_ops.py new file mode 100644 index 00000000..62729ca2 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/base_ops.py @@ -0,0 +1,474 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
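+#
+# Shared convolution blocks used by the detection backbones: BaseConv (conv + BN + activation),
+# DepthWiseConv / DWConv, Bottleneck, CSPLayer (the YOLOv5 C3 block), Focus, SPPBottleneck and
+# ShuffleNet-style blocks (ShuffleBlock, ShuffleCSPLayer, channel_shuffle).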
+import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .repvgg_block import RepVggBlock + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name='silu', inplace=True): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + + +def get_norm(name, out_channels, inplace=True): + if name == 'bn': + module = nn.BatchNorm2d(out_channels) + elif name == 'gn': + module = nn.GroupNorm(num_channels=out_channels, num_groups=32) + return module + + +class BaseConv(nn.Module): + """A Conv2d -> Batchnorm -> silu/leaky relu block""" + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + groups=1, + bias=False, + act='silu', + norm='bn'): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + if norm is not None: + self.bn = get_norm(norm, out_channels, inplace=True) + if act is not None: + self.act = get_activation(act, inplace=True) + self.with_norm = norm is not None + self.with_act = act is not None + + def forward(self, x): + x = self.conv(x) + if self.with_norm: + # x = self.norm(x) + x = self.bn(x) + if self.with_act: + x = self.act(x) + return x + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DepthWiseConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ksize, + stride=1, + groups=None, + bias=False, + act='silu', + norm='bn'): + super().__init__() + padding = (ksize - 1) // 2 + self.depthwise = nn.Conv2d( + in_channels, + in_channels, + kernel_size=ksize, + stride=stride, + padding=padding, + groups=in_channels, + bias=bias, + ) + + self.pointwise = nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias) + if norm is not None: + self.dwnorm = get_norm(norm, in_channels, inplace=True) + self.pwnorm = get_norm(norm, out_channels, inplace=True) + if act is not None: + self.act = get_activation(act, inplace=True) + + self.with_norm = norm is not None + self.with_act = act is not None + self.order = ['depthwise', 'dwnorm', 'pointwise', 'act'] + + def forward(self, x): + + for layer_name in self.order: + layer = self.__getattr__(layer_name) + if layer is not None: + x = layer(x) + return x + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + + def __init__(self, in_channels, out_channels, ksize, stride=1, act='silu'): + super().__init__() + self.dconv = BaseConv( + in_channels, + in_channels, + ksize=ksize, + stride=stride, + groups=in_channels, + act=act, + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, stride=1, groups=1, act=act) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__( + self, + in_channels, + out_channels, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + reparam=False, + ): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + k_conv1 = 3 if reparam else 1 + self.conv1 = BaseConv( + in_channels, hidden_channels, k_conv1, stride=1, act=act) + if reparam: + self.conv2 = 
RepVggBlock( + hidden_channels, out_channels, 3, stride=1, act=act) + else: + self.conv2 = Conv( + hidden_channels, out_channels, 3, stride=1, act=act) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + 'Residual layer with `in_channels` inputs.' + + def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv( + in_channels, mid_channels, ksize=1, stride=1, act='lrelu') + self.layer2 = BaseConv( + mid_channels, in_channels, ksize=3, stride=1, act='lrelu') + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + """Spatial pyramid pooling layer used in YOLOv3-SPP""" + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + activation='silu'): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv( + conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + reparam=False, + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. 
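+            shortcut (bool): add the residual connection inside each Bottleneck. Default value: True.
+            expansion (float): ratio of hidden channels to output channels. Default value: 0.5.
+            depthwise (bool): use depthwise separable convs inside the Bottlenecks. Default value: False.
+            act (str): activation name used by the conv blocks. Default value: 'silu'.
+            reparam (bool): build the Bottlenecks with re-parameterizable RepVgg 3x3 blocks. Default value: False.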
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv3 = BaseConv( + 2 * hidden_channels, out_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, + hidden_channels, + shortcut, + 1.0, + depthwise, + act=act, + reparam=reparam) for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + + def __init__(self, + in_channels, + out_channels, + ksize=1, + stride=1, + act='silu'): + super().__init__() + self.conv = BaseConv( + in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class fast_Focus(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ksize=1, + stride=1, + act='silu'): + super(Focus, self).__init__() + self.conv1 = self.focus_conv(w1=1.0) + self.conv2 = self.focus_conv(w3=1.0) + self.conv3 = self.focus_conv(w2=1.0) + self.conv4 = self.focus_conv(w4=1.0) + + self.conv = BaseConv( + in_channels * 4, out_channels, ksize, stride, act=act) + + def forward(self, x): + return self.conv( + torch.cat( + [self.conv1(x), + self.conv2(x), + self.conv3(x), + self.conv4(x)], 1)) + + def focus_conv(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0): + conv = nn.Conv2d(3, 3, 2, 2, groups=3, bias=False) + conv.weight = self.init_weights_constant(w1, w2, w3, w4) + conv.weight.requires_grad = False + return conv + + def init_weights_constant(self, w1=0.0, w2=0.0, w3=0.0, w4=0.0): + return nn.Parameter( + torch.tensor([[[[w1, w2], [w3, w4]]], [[[w1, w2], [w3, w4]]], + [[[w1, w2], [w3, w4]]]])) + + +# shufflenet block +def channel_shuffle(x, groups=2): + bat_size, channels, w, h = x.shape + group_c = channels // groups + x = x.view(bat_size, groups, group_c, w, h) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(bat_size, -1, w, h) + return x + + +def conv_1x1_bn(in_c, out_c, stride=1): + return nn.Sequential( + nn.Conv2d(in_c, out_c, 1, stride, 0, bias=False), + nn.BatchNorm2d(out_c), nn.ReLU(True)) + + +def conv_bn(in_c, out_c, stride=2): + return nn.Sequential( + nn.Conv2d(in_c, out_c, 3, stride, 1, bias=False), + nn.BatchNorm2d(out_c), nn.ReLU(True)) + + +class ShuffleBlock(nn.Module): + + def __init__(self, in_c, out_c, downsample=False): + super(ShuffleBlock, self).__init__() + self.downsample = downsample + half_c = out_c // 2 + if downsample: + self.branch1 = nn.Sequential( + # 3*3 dw conv, stride = 2 + # nn.Conv2d(in_c, in_c, 3, 2, 1, groups=in_c, bias=False), + nn.Conv2d(in_c, in_c, 3, 1, 1, groups=in_c, bias=False), + nn.BatchNorm2d(in_c), + # 1*1 pw conv + nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + + self.branch2 = nn.Sequential( + # 1*1 pw conv + nn.Conv2d(in_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True), + # 3*3 dw 
conv, stride = 2 + # nn.Conv2d(half_c, half_c, 3, 2, 1, groups=half_c, bias=False), + nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False), + nn.BatchNorm2d(half_c), + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + else: + # in_c = out_c + assert in_c == out_c + + self.branch2 = nn.Sequential( + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True), + # 3*3 dw conv, stride = 1 + nn.Conv2d(half_c, half_c, 3, 1, 1, groups=half_c, bias=False), + nn.BatchNorm2d(half_c), + # 1*1 pw conv + nn.Conv2d(half_c, half_c, 1, 1, 0, bias=False), + nn.BatchNorm2d(half_c), + nn.ReLU(True)) + + def forward(self, x): + out = None + if self.downsample: + # if it is downsampling, we don't need to do channel split + out = torch.cat((self.branch1(x), self.branch2(x)), 1) + else: + # channel split + channels = x.shape[1] + c = channels // 2 + x1 = x[:, :c, :, :] + x2 = x[:, c:, :, :] + out = torch.cat((x1, self.branch2(x2)), 1) + return channel_shuffle(out, 2) + + +class ShuffleCSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, + in_channels, + out_channels, + n=1, + shortcut=True, + expansion=0.5, + depthwise=False, + act='silu', + ): + """ + Args: + in_channels (int): input channels. + out_channels (int): output channels. + n (int): number of Bottlenecks. Default value: 1. + """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + self.conv2 = BaseConv( + in_channels, hidden_channels, 1, stride=1, act=act) + module_list = [ + Bottleneck( + hidden_channels, + hidden_channels, + shortcut, + 1.0, + depthwise, + act=act) for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + # add channel shuffle + return channel_shuffle(x, 2) diff --git a/modelscope/models/cv/tinynas_detection/core/neck_ops.py b/modelscope/models/cv/tinynas_detection/core/neck_ops.py new file mode 100644 index 00000000..7f481665 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/neck_ops.py @@ -0,0 +1,324 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
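For readers skimming the diff, here is a minimal standalone sketch of the 2-group channel shuffle that ShuffleBlock and ShuffleCSPLayer above rely on. It mirrors the logic of channel_shuffle in base_ops.py; the script below is illustrative only and is not part of the patch.

import torch


def channel_shuffle(x: torch.Tensor, groups: int = 2) -> torch.Tensor:
    b, c, h, w = x.shape
    x = x.view(b, groups, c // groups, h, w)   # split channels into groups
    x = x.transpose(1, 2).contiguous()         # interleave the groups
    return x.view(b, -1, h, w)


if __name__ == '__main__':
    x = torch.arange(8, dtype=torch.float32).view(1, 8, 1, 1)
    y = channel_shuffle(x, groups=2)
    # channels 0..3 and 4..7 end up interleaved: 0, 4, 1, 5, 2, 6, 3, 7
    print(y.flatten().tolist())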
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Swish(nn.Module): + + def __init__(self, inplace=True): + super(Swish, self).__init__() + self.inplace = inplace + + def forward(self, x): + if self.inplace: + x.mul_(F.sigmoid(x)) + return x + else: + return x * F.sigmoid(x) + + +def get_activation(name='silu', inplace=True): + if name is None: + return nn.Identity() + + if isinstance(name, str): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == 'swish': + module = Swish(inplace=inplace) + elif name == 'hardsigmoid': + module = nn.Hardsigmoid(inplace=inplace) + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + elif isinstance(name, nn.Module): + return name + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + + +class ConvBNLayer(nn.Module): + + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2d( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias=False) + self.bn = nn.BatchNorm2d(ch_out, ) + self.act = get_activation(act, inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + return x + + +class RepVGGBlock(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', deploy=False): + super(RepVGGBlock, self).__init__() + self.ch_in = ch_in + self.ch_out = ch_out + self.deploy = deploy + self.in_channels = ch_in + self.groups = 1 + if self.deploy is False: + self.rbr_dense = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=None) + self.rbr_1x1 = ConvBNLayer( + ch_in, ch_out, 1, stride=1, padding=0, act=None) + # self.rbr_identity = nn.BatchNorm2d(num_features=ch_in) if ch_out == ch_in else None + self.rbr_identity = None + else: + self.rbr_reparam = nn.Conv2d( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + self.act = get_activation(act) if act is None or isinstance( + act, (str, dict)) else act + + def forward(self, x): + if self.deploy: + print('----------deploy----------') + y = self.rbr_reparam(x) + else: + if self.rbr_identity is None: + y = self.rbr_dense(x) + self.rbr_1x1(x) + else: + y = self.rbr_dense(x) + self.rbr_1x1(x) + self.rbr_identity(x) + + y = self.act(y) + return y + + def switch_to_deploy(self): + print('switch') + if not hasattr(self, 'rbr_reparam'): + # return + self.rbr_reparam = nn.Conv2d( + in_channels=self.ch_in, + out_channels=self.ch_out, + kernel_size=3, + stride=1, + padding=1, + groups=1) + print('switch') + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + # self.__delattr__(self.rbr_dense) + # self.__delattr__(self.rbr_1x1) + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return 
kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + # if isinstance(branch, nn.Sequential): + if isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + +class BasicBlock(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock, self).__init__() + assert ch_in == ch_out + # self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + # y = self.conv1(x) + y = self.conv2(x) + if self.shortcut: + return x + y + else: + return y + + +class BasicBlock_3x3(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock_3x3, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class BasicBlock_3x3_Reverse(nn.Module): + + def __init__(self, ch_in, ch_out, act='relu', shortcut=True): + super(BasicBlock_3x3_Reverse, self).__init__() + assert ch_in == ch_out + self.conv1 = ConvBNLayer( + ch_in, ch_out, 3, stride=1, padding=1, act=act) + # self.conv1 = ConvBNLayer(ch_in, ch_out, 1, stride=1, padding=0, act=act) + self.conv2 = RepVGGBlock(ch_in, ch_out, act=act) + self.shortcut = shortcut + + def forward(self, x): + y = self.conv2(x) + y = self.conv1(y) + if self.shortcut: + return x + y + else: + return y + + +class SPP(nn.Module): + + def __init__( + self, + ch_in, + ch_out, + k, + pool_size, + act='swish', + ): + super(SPP, self).__init__() + self.pool = [] + for i, size in enumerate(pool_size): + pool = nn.MaxPool2d( + kernel_size=size, stride=1, padding=size // 2, ceil_mode=False) + self.add_module('pool{}'.format(i), pool) + self.pool.append(pool) + self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) + + def forward(self, x): + outs = [x] + + for pool in self.pool: + outs.append(pool(x)) + y = torch.cat(outs, axis=1) + + y = self.conv(y) + return y + + +class CSPStage(nn.Module): + + def __init__(self, block_fn, ch_in, ch_out, n, act='swish', spp=False): + super(CSPStage, self).__init__() + + ch_mid = 
int(ch_out // 2) + self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) + # self.conv2 = ConvBNLayer(ch_in, ch_mid, 3, stride=1, padding=1, act=act) + self.convs = nn.Sequential() + + next_ch_in = ch_mid + for i in range(n): + if block_fn == 'BasicBlock': + self.convs.add_module( + str(i), + BasicBlock(next_ch_in, ch_mid, act=act, shortcut=False)) + elif block_fn == 'BasicBlock_3x3': + self.convs.add_module( + str(i), + BasicBlock_3x3(next_ch_in, ch_mid, act=act, shortcut=True)) + elif block_fn == 'BasicBlock_3x3_Reverse': + self.convs.add_module( + str(i), + BasicBlock_3x3_Reverse( + next_ch_in, ch_mid, act=act, shortcut=True)) + else: + raise NotImplementedError + if i == (n - 1) // 2 and spp: + self.convs.add_module( + 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) + next_ch_in = ch_mid + # self.convs = nn.Sequential(*convs) + self.conv3 = ConvBNLayer(ch_mid * (n + 1), ch_out, 1, act=act) + + def forward(self, x): + y1 = self.conv1(x) + y2 = self.conv2(x) + + mid_out = [y1] + for conv in self.convs: + y2 = conv(y2) + mid_out.append(y2) + y = torch.cat(mid_out, axis=1) + y = self.conv3(y) + return y diff --git a/modelscope/models/cv/tinynas_detection/core/repvgg_block.py b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py new file mode 100644 index 00000000..06966a4e --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/repvgg_block.py @@ -0,0 +1,205 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.parameter import Parameter + + +def get_activation(name='silu', inplace=True): + if name == 'silu': + module = nn.SiLU(inplace=inplace) + elif name == 'relu': + module = nn.ReLU(inplace=inplace) + elif name == 'lrelu': + module = nn.LeakyReLU(0.1, inplace=inplace) + elif name == 'identity': + module = nn.Identity() + else: + raise AttributeError('Unsupported act type: {}'.format(name)) + return module + + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + '''Basic cell for rep-style block, including conv and bn''' + result = nn.Sequential() + result.add_module( + 'conv', + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False)) + result.add_module('bn', nn.BatchNorm2d(num_features=out_channels)) + return result + + +class RepVggBlock(nn.Module): + '''RepVggBlock is a basic rep-style block, including training and deploy status + This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py + ''' + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + padding_mode='zeros', + deploy=False, + use_se=False, + act='relu', + norm=None): + super(RepVggBlock, self).__init__() + """ Initialization of the class. + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 1 + dilation (int or tuple, optional): Spacing between kernel elements. 
Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + deploy: Whether to be deploy status or training status. Default: False + use_se: Whether to use se. Default: False + """ + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + if isinstance(act, str): + self.nonlinearity = get_activation(act) + else: + self.nonlinearity = act + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + self.rbr_identity = None + self.rbr_dense = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups) + self.rbr_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups) + + def forward(self, inputs): + '''Forward process''' + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + return self.nonlinearity( + self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + 
bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True diff --git a/modelscope/models/cv/tinynas_detection/core/utils.py b/modelscope/models/cv/tinynas_detection/core/utils.py new file mode 100644 index 00000000..482f12fb --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/core/utils.py @@ -0,0 +1,196 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import numpy as np +import torch +import torchvision + +__all__ = [ + 'filter_box', + 'postprocess_airdet', + 'bboxes_iou', + 'matrix_iou', + 'adjust_box_anns', + 'xyxy2xywh', + 'xyxy2cxcywh', +] + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + iou_thr, + max_num=100, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels \ + are 0-based. + """ + num_classes = multi_scores.size(1) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + scores = multi_scores + # filter out boxes with low scores + valid_mask = scores > score_thr # 1000 * 80 bool + + # We use masked_select for ONNX exporting purpose, + # which is equivalent to bboxes = bboxes[valid_mask] + # (TODO): as ONNX does not support repeat now, + # we have to use this ugly code + # bboxes -> 1000, 4 + bboxes = torch.masked_select( + bboxes, + torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), + -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 + if score_factors is not None: + scores = scores * score_factors[:, None] + scores = torch.masked_select(scores, valid_mask) + labels = valid_mask.nonzero(as_tuple=False)[:, 1] + + if bboxes.numel() == 0: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + scores = multi_bboxes.new_zeros((0, )) + + return bboxes, scores, labels + + keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) + + if max_num > 0: + keep = keep[:max_num] + + return bboxes[keep], scores[keep], labels[keep] + + +def filter_box(output, scale_range): + """ + output: (N, 5+class) shape + """ + min_scale, max_scale = scale_range + w = output[:, 2] - output[:, 0] + h = output[:, 3] - output[:, 1] + keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) + return output[keep] + + +def filter_results(boxlist, num_classes, nms_thre): + boxes = boxlist.bbox + scores = boxlist.get_field('scores') + cls = boxlist.get_field('labels') + nms_out_index = torchvision.ops.batched_nms( + boxes, + scores, + cls, + 
nms_thre, + ) + boxlist = boxlist[nms_out_index] + + return boxlist + + +def postprocess_airdet(prediction, + num_classes, + conf_thre=0.7, + nms_thre=0.45, + imgs=None): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + multi_bboxes = image_pred[:, :4] + multi_scores = image_pred[:, 5:] + detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores, + conf_thre, nms_thre, 500) + detections = torch.cat( + (detections, scores[:, None], scores[:, None], labels[:, None]), + dim=1) + + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + return output + + +def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + return area_i / (area_a[:, None] + area_b - area_i) + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) + + +def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): + bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) + bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) + return bbox + + +def xyxy2xywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes + + +def xyxy2cxcywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 + bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 + return bboxes diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py new file mode 100644 index 00000000..615b13a8 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/detector.py @@ -0,0 +1,181 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
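A quick standalone check of the box conventions used in core/utils.py above: the xyxy (x1, y1, x2, y2) to cxcywh (center, size) conversion, written copy-based here for clarity (the patch version modifies its input in place), plus a pairwise IoU in the spirit of matrix_iou. The helper names are illustrative, not part of the patch.

import numpy as np


def xyxy2cxcywh(boxes: np.ndarray) -> np.ndarray:
    out = boxes.copy()
    out[:, 2] -= out[:, 0]            # width
    out[:, 3] -= out[:, 1]            # height
    out[:, 0] += out[:, 2] * 0.5      # center x
    out[:, 1] += out[:, 3] * 0.5      # center y
    return out


def pairwise_iou(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    lt = np.maximum(a[:, None, :2], b[:, :2])
    rb = np.minimum(a[:, None, 2:], b[:, 2:])
    inter = np.prod(np.clip(rb - lt, 0, None), axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    return inter / (area_a[:, None] + area_b - inter + 1e-12)


if __name__ == '__main__':
    boxes = np.array([[10., 20., 50., 60.]])
    print(xyxy2cxcywh(boxes))                 # [[30. 40. 40. 40.]]
    print(pairwise_iou(boxes, boxes + 20.))   # partial overlap, ~0.14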
+ +import os.path as osp +import pickle + +import cv2 +import torch +import torchvision + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .backbone import build_backbone +from .head import build_head +from .neck import build_neck +from .utils import parse_config + + +class SingleStageDetector(TorchModel): + """ + The base class of single stage detector. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """ + init model by cfg + """ + super().__init__(model_dir, *args, **kwargs) + + config_path = osp.join(model_dir, 'airdet_s.py') + config = parse_config(config_path) + self.cfg = config + model_path = osp.join(model_dir, config.model.name) + label_map = osp.join(model_dir, config.model.class_map) + self.label_map = pickle.load(open(label_map, 'rb')) + self.size_divisible = config.dataset.size_divisibility + self.num_classes = config.model.head.num_classes + self.conf_thre = config.model.head.nms_conf_thre + self.nms_thre = config.model.head.nms_iou_thre + + self.backbone = build_backbone(self.cfg.model.backbone) + self.neck = build_neck(self.cfg.model.neck) + self.head = build_head(self.cfg.model.head) + + self.load_pretrain_model(model_path) + + def load_pretrain_model(self, pretrain_model): + + state_dict = torch.load(pretrain_model, map_location='cpu')['model'] + new_state_dict = {} + for k, v in state_dict.items(): + k = k.replace('module.', '') + new_state_dict[k] = v + self.load_state_dict(new_state_dict, strict=True) + + def inference(self, x): + + if self.training: + return self.forward_train(x) + else: + return self.forward_eval(x) + + def forward_train(self, x): + + pass + + def forward_eval(self, x): + + x = self.backbone(x) + x = self.neck(x) + prediction = self.head(x) + + return prediction + + def preprocess(self, image): + image = torch.from_numpy(image).type(torch.float32) + image = image.permute(2, 0, 1) + shape = image.shape # c, h, w + if self.size_divisible > 0: + import math + stride = self.size_divisible + shape = list(shape) + shape[1] = int(math.ceil(shape[1] / stride) * stride) + shape[2] = int(math.ceil(shape[2] / stride) * stride) + shape = tuple(shape) + pad_img = image.new(*shape).zero_() + pad_img[:, :image.shape[1], :image.shape[2]].copy_(image) + pad_img = pad_img.unsqueeze(0) + + return pad_img + + def postprocess(self, preds): + bboxes, scores, labels_idx = postprocess_gfocal( + preds, self.num_classes, self.conf_thre, self.nms_thre) + bboxes = bboxes.cpu().numpy() + scores = scores.cpu().numpy() + labels_idx = labels_idx.cpu().numpy() + labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx] + + return (bboxes, scores, labels) + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + iou_thr, + max_num=100, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. 
+ score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels \ + are 0-based. + """ + num_classes = multi_scores.size(1) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + scores = multi_scores + # filter out boxes with low scores + valid_mask = scores > score_thr # 1000 * 80 bool + + # We use masked_select for ONNX exporting purpose, + # which is equivalent to bboxes = bboxes[valid_mask] + # (TODO): as ONNX does not support repeat now, + # we have to use this ugly code + # bboxes -> 1000, 4 + bboxes = torch.masked_select( + bboxes, + torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), + -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 + if score_factors is not None: + scores = scores * score_factors[:, None] + scores = torch.masked_select(scores, valid_mask) + labels = valid_mask.nonzero(as_tuple=False)[:, 1] + + if bboxes.numel() == 0: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + scores = multi_bboxes.new_zeros((0, )) + + return bboxes, scores, labels + + keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) + + if max_num > 0: + keep = keep[:max_num] + + return bboxes[keep], scores[keep], labels[keep] + + +def postprocess_gfocal(prediction, num_classes, conf_thre=0.05, nms_thre=0.7): + assert prediction.shape[0] == 1 + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + multi_bboxes = image_pred[:, :4] + multi_scores = image_pred[:, 4:] + detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores, + conf_thre, nms_thre, 500) + + return detections, scores, labels diff --git a/modelscope/models/cv/tinynas_detection/head/__init__.py b/modelscope/models/cv/tinynas_detection/head/__init__.py new file mode 100644 index 00000000..f870fae1 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/head/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import copy + +from .gfocal_v2_tiny import GFocalHead_Tiny + + +def build_head(cfg): + + head_cfg = copy.deepcopy(cfg) + name = head_cfg.pop('name') + if name == 'GFocalV2': + return GFocalHead_Tiny(**head_cfg) + else: + raise NotImplementedError diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py new file mode 100644 index 00000000..41f35968 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py @@ -0,0 +1,361 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
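A minimal sketch of the size-divisibility padding performed in SingleStageDetector.preprocess above: height and width are rounded up to a multiple of the network stride and the image is zero-padded at the bottom/right before a batch dimension is added. It assumes the tensor is already in (C, H, W) layout; pad_to_divisible is an illustrative helper, not part of the patch.

import math

import torch


def pad_to_divisible(image: torch.Tensor, divisor: int = 32) -> torch.Tensor:
    # image: (C, H, W) float tensor
    c, h, w = image.shape
    new_h = int(math.ceil(h / divisor) * divisor)
    new_w = int(math.ceil(w / divisor) * divisor)
    padded = image.new_zeros((c, new_h, new_w))
    padded[:, :h, :w] = image            # keep content in the top-left corner
    return padded.unsqueeze(0)           # add batch dim, as preprocess does


if __name__ == '__main__':
    img = torch.rand(3, 500, 707)
    print(pad_to_divisible(img, 32).shape)   # torch.Size([1, 3, 512, 736])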
+ +import functools +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..core.base_ops import BaseConv, DWConv + + +class Scale(nn.Module): + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale + + +def multi_apply(func, *args, **kwargs): + + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def xyxy2CxCywh(xyxy, size=None): + x1 = xyxy[..., 0] + y1 = xyxy[..., 1] + x2 = xyxy[..., 2] + y2 = xyxy[..., 3] + + cx = (x1 + x2) / 2 + cy = (y1 + y2) / 2 + + w = x2 - x1 + h = y2 - y1 + if size is not None: + w = w.clamp(min=0, max=size[1]) + h = h.clamp(min=0, max=size[0]) + return torch.stack([cx, cy, w, h], axis=-1) + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + """ + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + """ + + def __init__(self, reg_max=16): + super(Integral, self).__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x): + """Forward feature from the regression head to get integral result of + bounding box location. + """ + shape = x.size() + x = F.softmax(x.reshape(*shape[:-1], 4, self.reg_max + 1), dim=-1) + b, nb, ne, _ = x.size() + x = x.reshape(b * nb * ne, self.reg_max + 1) + y = self.project.type_as(x).unsqueeze(1) + x = torch.matmul(x, y).reshape(b, nb, 4) + return x + + +class GFocalHead_Tiny(nn.Module): + """Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality + Estimation for Dense Object Detection. + """ + + def __init__( + self, + num_classes, + in_channels, + stacked_convs=4, # 4 + feat_channels=256, + reg_max=12, + reg_topk=4, + reg_channels=64, + strides=[8, 16, 32], + add_mean=True, + norm='gn', + act='relu', + start_kernel_size=3, + conv_groups=1, + conv_type='BaseConv', + simOTA_cls_weight=1.0, + simOTA_iou_weight=3.0, + octbase=8, + simlqe=False, + **kwargs): + self.simlqe = simlqe + self.num_classes = num_classes + self.in_channels = in_channels + self.strides = strides + self.feat_channels = feat_channels if isinstance(feat_channels, list) \ + else [feat_channels] * len(self.strides) + + self.cls_out_channels = num_classes + 1 # add 1 for keep consistance with former models + # and will be deprecated in future. 
+ self.stacked_convs = stacked_convs + self.conv_groups = conv_groups + self.reg_max = reg_max + self.reg_topk = reg_topk + self.reg_channels = reg_channels + self.add_mean = add_mean + self.total_dim = reg_topk + self.start_kernel_size = start_kernel_size + + self.norm = norm + self.act = act + self.conv_module = DWConv if conv_type == 'DWConv' else BaseConv + + if add_mean: + self.total_dim += 1 + + super(GFocalHead_Tiny, self).__init__() + self.integral = Integral(self.reg_max) + + self._init_layers() + + def _build_not_shared_convs(self, in_channel, feat_channels): + self.relu = nn.ReLU(inplace=True) + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + chn = feat_channels if i > 0 else in_channel + kernel_size = 3 if i > 0 else self.start_kernel_size + cls_convs.append( + self.conv_module( + chn, + feat_channels, + kernel_size, + stride=1, + groups=self.conv_groups, + norm=self.norm, + act=self.act)) + reg_convs.append( + self.conv_module( + chn, + feat_channels, + kernel_size, + stride=1, + groups=self.conv_groups, + norm=self.norm, + act=self.act)) + if not self.simlqe: + conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)] + else: + conf_vector = [ + nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1) + ] + conf_vector += [self.relu] + conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()] + reg_conf = nn.Sequential(*conf_vector) + + return cls_convs, reg_convs, reg_conf + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.reg_confs = nn.ModuleList() + + for i in range(len(self.strides)): + cls_convs, reg_convs, reg_conf = self._build_not_shared_convs( + self.in_channels[i], self.feat_channels[i]) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + self.reg_confs.append(reg_conf) + + self.gfl_cls = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], self.cls_out_channels, 3, padding=1) + for i in range(len(self.strides)) + ]) + + self.gfl_reg = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) + for i in range(len(self.strides)) + ]) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def forward(self, + xin, + labels=None, + imgs=None, + conf_thre=0.05, + nms_thre=0.7): + + # prepare labels during training + b, c, h, w = xin[0].shape + if labels is not None: + gt_bbox_list = [] + gt_cls_list = [] + for label in labels: + gt_bbox_list.append(label.bbox) + gt_cls_list.append((label.get_field('labels') + - 1).long()) # labels starts from 1 + + # prepare priors for label assignment and bbox decode + mlvl_priors_list = [ + self.get_single_level_center_priors( + xin[i].shape[0], + xin[i].shape[-2:], + stride, + dtype=torch.float32, + device=xin[0].device) for i, stride in enumerate(self.strides) + ] + mlvl_priors = torch.cat(mlvl_priors_list, dim=1) + + # forward for bboxes and classification prediction + cls_scores, bbox_preds = multi_apply( + self.forward_single, + xin, + self.cls_convs, + self.reg_convs, + self.gfl_cls, + self.gfl_reg, + self.reg_confs, + self.scales, + ) + flatten_cls_scores = torch.cat(cls_scores, dim=1) + flatten_bbox_preds = torch.cat(bbox_preds, dim=1) + + # calculating losses or bboxes decoded + if self.training: + loss = self.loss(flatten_cls_scores, flatten_bbox_preds, + gt_bbox_list, gt_cls_list, mlvl_priors) + return loss + else: + output = 
self.get_bboxes(flatten_cls_scores, flatten_bbox_preds, + mlvl_priors) + return output + + def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, + reg_conf, scale): + """Forward feature of a single scale level. + + """ + cls_feat = x + reg_feat = x + + for cls_conv in cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in reg_convs: + reg_feat = reg_conv(reg_feat) + + bbox_pred = scale(gfl_reg(reg_feat)).float() + N, C, H, W = bbox_pred.size() + prob = F.softmax( + bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) + if not self.simlqe: + prob_topk, _ = prob.topk(self.reg_topk, dim=2) + + if self.add_mean: + stat = torch.cat( + [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2) + else: + stat = prob_topk + + quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W)) + else: + quality_score = reg_conf( + bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W)) + + cls_score = gfl_cls(cls_feat).sigmoid() * quality_score + + flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2) + flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2) + return flatten_cls_score, flatten_bbox_pred + + def get_single_level_center_priors(self, batch_size, featmap_size, stride, + dtype, device): + + h, w = featmap_size + x_range = (torch.arange(0, int(w), dtype=dtype, + device=device)) * stride + y_range = (torch.arange(0, int(h), dtype=dtype, + device=device)) * stride + + x = x_range.repeat(h, 1) + y = y_range.unsqueeze(-1).repeat(1, w) + + y = y.flatten() + x = x.flatten() + strides = x.new_full((x.shape[0], ), stride) + priors = torch.stack([x, y, strides, strides], dim=-1) + + return priors.unsqueeze(0).repeat(batch_size, 1, 1) + + def sample(self, assign_result, gt_bboxes): + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert pos_assigned_gt_inds.numel() == 0 + pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] + + return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds + + def get_bboxes(self, + cls_preds, + reg_preds, + mlvl_center_priors, + img_meta=None): + + dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None] + bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds) + + res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1) + + return res diff --git a/modelscope/models/cv/tinynas_detection/neck/__init__.py b/modelscope/models/cv/tinynas_detection/neck/__init__.py new file mode 100644 index 00000000..3c418c29 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
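A compact, standalone sketch of the GFL-style box decoding that Integral, distance2bbox and GFocalHead_Tiny.get_bboxes above implement together: each side of a box is predicted as a distribution over reg_max + 1 bins, the expectation of that distribution is scaled by the anchor point's stride, and the four distances are turned into (x1, y1, x2, y2). decode_boxes is an illustrative helper, not part of the patch.

import torch
import torch.nn.functional as F


def decode_boxes(reg_logits: torch.Tensor, priors: torch.Tensor,
                 reg_max: int = 12) -> torch.Tensor:
    # reg_logits: (N, 4 * (reg_max + 1)); priors: (N, 4) = (x, y, stride, stride)
    project = torch.linspace(0, reg_max, reg_max + 1)
    prob = F.softmax(reg_logits.view(-1, 4, reg_max + 1), dim=-1)
    dist = (prob * project).sum(-1) * priors[:, 2:3]   # expected l, t, r, b in pixels
    x1y1 = priors[:, :2] - dist[:, :2]
    x2y2 = priors[:, :2] + dist[:, 2:]
    return torch.cat([x1y1, x2y2], dim=-1)


if __name__ == '__main__':
    logits = torch.randn(2, 4 * 13)                    # reg_max = 12
    priors = torch.tensor([[64., 64., 8., 8.], [128., 96., 16., 16.]])
    print(decode_boxes(logits, priors, reg_max=12))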
+ +import copy + +from .giraffe_fpn import GiraffeNeck +from .giraffe_fpn_v2 import GiraffeNeckV2 + + +def build_neck(cfg): + neck_cfg = copy.deepcopy(cfg) + name = neck_cfg.pop('name') + if name == 'GiraffeNeck': + return GiraffeNeck(**neck_cfg) + elif name == 'GiraffeNeckV2': + return GiraffeNeckV2(**neck_cfg) diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py new file mode 100644 index 00000000..289fdfd2 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py @@ -0,0 +1,235 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import collections +import itertools +import os + +import networkx as nx +from omegaconf import OmegaConf + +Node = collections.namedtuple('Node', ['id', 'inputs', 'type']) + + +def get_graph_info(graph): + input_nodes = [] + output_nodes = [] + Nodes = [] + for node in range(graph.number_of_nodes()): + tmp = list(graph.neighbors(node)) + tmp.sort() + type = -1 + if node < tmp[0]: + input_nodes.append(node) + type = 0 + if node > tmp[-1]: + output_nodes.append(node) + type = 1 + Nodes.append(Node(node, [n for n in tmp if n < node], type)) + return Nodes, input_nodes, output_nodes + + +def nodeid_trans(id, cur_level, num_levels): + if id % 2 == 1: + gap = int(((id + 1) // 2) * num_levels * 2) + else: + a = (num_levels - cur_level) * 2 - 1 + b = ((id + 1) // 2) * num_levels * 2 + gap = int(a + b) + return cur_level + gap + + +def gen_log2n_graph_file(log2n_graph_file, depth_multiplier): + f = open(log2n_graph_file, 'w') + for i in range(depth_multiplier): + for j in [1, 2, 4, 8, 16, 32]: + if i - j < 0: + break + else: + f.write('%d,%d\n' % (i - j, i)) + f.close() + + +def get_log2n_graph(depth_multiplier): + nodes = [] + connnections = [] + + for i in range(depth_multiplier): + nodes.append(i) + for j in [1, 2, 4, 8, 16, 32]: + if i - j < 0: + break + else: + connnections.append((i - j, i)) + return nodes, connnections + + +def get_dense_graph(depth_multiplier): + nodes = [] + connections = [] + + for i in range(depth_multiplier): + nodes.append(i) + for j in range(i): + connections.append((j, i)) + return nodes, connections + + +def giraffeneck_config(min_level, + max_level, + weight_method=None, + depth_multiplier=5, + with_backslash=False, + with_slash=False, + with_skip_connect=False, + skip_connect_type='dense'): + """Graph config with log2n merge and panet""" + if skip_connect_type == 'dense': + nodes, connections = get_dense_graph(depth_multiplier) + elif skip_connect_type == 'log2n': + nodes, connections = get_log2n_graph(depth_multiplier) + graph = nx.Graph() + graph.add_nodes_from(nodes) + graph.add_edges_from(connections) + + drop_node = [] + nodes, input_nodes, output_nodes = get_graph_info(graph) + + weight_method = weight_method or 'fastattn' + + num_levels = max_level - min_level + 1 + node_ids = {min_level + i: [i] for i in range(num_levels)} + node_ids_per_layer = {} + + pnodes = {} + + def update_drop_node(new_id, input_offsets): + if new_id not in drop_node: + new_id = new_id + else: + while new_id in drop_node: + if new_id in pnodes: + for n in pnodes[new_id]['inputs_offsets']: + if n not in input_offsets and n not in drop_node: + input_offsets.append(n) + new_id = new_id - 1 + if new_id not in input_offsets: + input_offsets.append(new_id) + + # top-down layer + for i in range(max_level, min_level - 1, -1): + 
node_ids_per_layer[i] = [] + for id, node in enumerate(nodes): + input_offsets = [] + if id in input_nodes: + input_offsets.append(node_ids[i][0]) + else: + if with_skip_connect: + for input_id in node.inputs: + new_id = nodeid_trans(input_id, i - min_level, + num_levels) + update_drop_node(new_id, input_offsets) + + # add top2down + new_id = nodeid_trans(id, i - min_level, num_levels) + + # add backslash node + def cal_backslash_node(id): + ind = id // num_levels + mod = id % num_levels + if ind % 2 == 0: # even + if mod == (num_levels - 1): + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod - 1) + else: # odd + if mod == 0: + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod + 1) + + return last + + # add slash node + def cal_slash_node(id): + ind = id // num_levels + mod = id % num_levels + if ind % 2 == 1: # odd + if mod == (num_levels - 1): + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod - 1) + else: # even + if mod == 0: + last = -1 + else: + last = (ind - 1) * num_levels + ( + num_levels - 1 - mod + 1) + + return last + + # add last node + last = new_id - 1 + update_drop_node(last, input_offsets) + + if with_backslash: + backslash = cal_backslash_node(new_id) + if backslash != -1 and backslash not in input_offsets: + input_offsets.append(backslash) + + if with_slash: + slash = cal_slash_node(new_id) + if slash != -1 and slash not in input_offsets: + input_offsets.append(slash) + + if new_id in drop_node: + input_offsets = [] + + pnodes[new_id] = { + 'reduction': 1 << i, + 'inputs_offsets': input_offsets, + 'weight_method': weight_method, + 'is_out': 0, + } + + input_offsets = [] + for out_id in output_nodes: + new_id = nodeid_trans(out_id, i - min_level, num_levels) + input_offsets.append(new_id) + + pnodes[node_ids[i][0] + num_levels * (len(nodes) + 1)] = { + 'reduction': 1 << i, + 'inputs_offsets': input_offsets, + 'weight_method': weight_method, + 'is_out': 1, + } + + pnodes = dict(sorted(pnodes.items(), key=lambda x: x[0])) + return pnodes + + +def get_graph_config(fpn_name, + min_level=3, + max_level=7, + weight_method='concat', + depth_multiplier=5, + with_backslash=False, + with_slash=False, + with_skip_connect=False, + skip_connect_type='dense'): + name_to_config = { + 'giraffeneck': + giraffeneck_config( + min_level=min_level, + max_level=max_level, + weight_method=weight_method, + depth_multiplier=depth_multiplier, + with_backslash=with_backslash, + with_slash=with_slash, + with_skip_connect=with_skip_connect, + skip_connect_type=skip_connect_type), + } + return name_to_config[fpn_name] diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py new file mode 100644 index 00000000..b7087779 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py @@ -0,0 +1,661 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. 
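To make the graph construction in giraffe_config.py above easier to follow, here is a standalone illustration of the two skip-connect patterns ('dense' and 'log2n') that giraffeneck_config feeds into networkx. The helpers below mirror get_dense_graph / get_log2n_graph and are illustrative only, not part of the patch.

def dense_graph(n):
    # every earlier node is connected to the current one
    return [(j, i) for i in range(n) for j in range(i)]


def log2n_graph(n):
    # only nodes 1, 2, 4, 8, ... steps back are connected
    edges = []
    for i in range(n):
        for gap in (1, 2, 4, 8, 16, 32):
            if i - gap < 0:
                break
            edges.append((i - gap, i))
    return edges


if __name__ == '__main__':
    print(dense_graph(5))    # 10 edges: every (j, i) with j < i
    print(log2n_graph(5))    # e.g. node 4 links back to nodes 3, 2 and 0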
+ +import logging +import math +from collections import OrderedDict +from functools import partial +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm import create_model +from timm.models.layers import (Swish, create_conv2d, create_pool2d, + get_act_layer) + +from ..core.base_ops import CSPLayer, ShuffleBlock, ShuffleCSPLayer +from .giraffe_config import get_graph_config + +_ACT_LAYER = Swish + + +class SequentialList(nn.Sequential): + """ This module exists to work around torchscript typing issues list -> list""" + + def __init__(self, *args): + super(SequentialList, self).__init__(*args) + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + for module in self: + x = module(x) + return x + + +class ConvBnAct2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + padding='', + bias=False, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER): + super(ConvBnAct2d, self).__init__() + + self.conv = create_conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias) + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +class SeparableConv2d(nn.Module): + """ Separable Conv + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilation=1, + padding='', + bias=False, + channel_multiplier=1.0, + pw_kernel_size=1, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER): + super(SeparableConv2d, self).__init__() + self.conv_dw = create_conv2d( + in_channels, + int(in_channels * channel_multiplier), + kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), + out_channels, + pw_kernel_size, + padding=padding, + bias=bias) + + self.bn = None if norm_layer is None else norm_layer(out_channels) + self.act = None if act_layer is None else act_layer(inplace=True) + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + if self.bn is not None: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +def _init_weight( + m, + n='', +): + """ Weight initialization as per Tensorflow official implementations. + """ + + def _fan_in_out(w, groups=1): + dimensions = w.dim() + if dimensions < 2: + raise ValueError( + 'Fan in and fan out can not be computed for tensor with fewer than 2 dimensions' + ) + num_input_fmaps = w.size(1) + num_output_fmaps = w.size(0) + receptive_field_size = 1 + if w.dim() > 2: + receptive_field_size = w[0][0].numel() + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + fan_out //= groups + return fan_in, fan_out + + def _glorot_uniform(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., (fan_in + fan_out) / 2.) 
# fan avg + limit = math.sqrt(3.0 * gain) + w.data.uniform_(-limit, limit) + + def _variance_scaling(w, gain=1, groups=1): + fan_in, fan_out = _fan_in_out(w, groups) + gain /= max(1., fan_in) # fan in + std = math.sqrt(gain) + w.data.normal_(std=std) + + if isinstance(m, SeparableConv2d): + if 'box_net' in n or 'class_net' in n: + _variance_scaling(m.conv_dw.weight, groups=m.conv_dw.groups) + _variance_scaling(m.conv_pw.weight) + if m.conv_pw.bias is not None: + if 'class_net.predict' in n: + m.conv_pw.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv_pw.bias.data.zero_() + else: + _glorot_uniform(m.conv_dw.weight, groups=m.conv_dw.groups) + _glorot_uniform(m.conv_pw.weight) + if m.conv_pw.bias is not None: + m.conv_pw.bias.data.zero_() + elif isinstance(m, ConvBnAct2d): + if 'box_net' in n or 'class_net' in n: + m.conv.weight.data.normal_(std=.01) + if m.conv.bias is not None: + if 'class_net.predict' in n: + m.conv.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.conv.bias.data.zero_() + else: + _glorot_uniform(m.conv.weight) + if m.conv.bias is not None: + m.conv.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +def _init_weight_alt( + m, + n='', +): + """ Weight initialization alternative, based on EfficientNet bacbkone init w/ class bias addition + NOTE: this will likely be removed after some experimentation + """ + if isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + if 'class_net.predict' in n: + m.bias.data.fill_(-math.log((1 - 0.01) / 0.01)) + else: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + +class Interpolate2d(nn.Module): + r"""Resamples a 2d Image + + The input data is assumed to be of the form + `minibatch x channels x [optional depth] x [optional height] x width`. + Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor. + + The algorithms available for upsampling are nearest neighbor and linear, + bilinear, bicubic and trilinear for 3D, 4D and 5D input Tensor, + respectively. + + One can either give a :attr:`scale_factor` or the target output :attr:`size` to + calculate the output size. (You cannot give both, as it is ambiguous) + + Args: + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int], optional): + output spatial sizes + scale_factor (float or Tuple[float] or Tuple[float, float] or Tuple[float, float, float], optional): + multiplier for spatial size. Has to match input size if it is a tuple. + mode (str, optional): the upsampling algorithm: one of ``'nearest'``, + ``'linear'``, ``'bilinear'``, ``'bicubic'`` and ``'trilinear'``. + Default: ``'nearest'`` + align_corners (bool, optional): if ``True``, the corner pixels of the input + and output tensors are aligned, and thus preserving the values at + those pixels. This only has effect when :attr:`mode` is + ``'linear'``, ``'bilinear'``, or ``'trilinear'``. 
Default: ``False`` + """ + __constants__ = ['size', 'scale_factor', 'mode', 'align_corners', 'name'] + name: str + size: Optional[Union[int, Tuple[int, int]]] + scale_factor: Optional[Union[float, Tuple[float, float]]] + mode: str + align_corners: Optional[bool] + + def __init__(self, + size: Optional[Union[int, Tuple[int, int]]] = None, + scale_factor: Optional[Union[float, Tuple[float, + float]]] = None, + mode: str = 'nearest', + align_corners: bool = False) -> None: + super(Interpolate2d, self).__init__() + self.name = type(self).__name__ + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = None if mode == 'nearest' else align_corners + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.interpolate( + input, + self.size, + self.scale_factor, + self.mode, + self.align_corners, + recompute_scale_factor=False) + + +class ResampleFeatureMap(nn.Sequential): + + def __init__(self, + in_channels, + out_channels, + reduction_ratio=1., + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + apply_bn=False, + conv_after_downsample=False, + redundant_bias=False): + super(ResampleFeatureMap, self).__init__() + downsample = downsample or 'max' + upsample = upsample or 'nearest' + self.in_channels = in_channels + self.out_channels = out_channels + self.reduction_ratio = reduction_ratio + self.conv_after_downsample = conv_after_downsample + + conv = None + if in_channels != out_channels: + conv = ConvBnAct2d( + in_channels, + out_channels, + kernel_size=1, + padding=pad_type, + norm_layer=norm_layer if apply_bn else None, + bias=not apply_bn or redundant_bias, + act_layer=None) + + if reduction_ratio > 1: + if conv is not None and not self.conv_after_downsample: + self.add_module('conv', conv) + if downsample in ('max', 'avg'): + stride_size = int(reduction_ratio) + downsample = create_pool2d( + downsample, + kernel_size=stride_size + 1, + stride=stride_size, + padding=pad_type) + else: + downsample = Interpolate2d( + scale_factor=1. 
/ reduction_ratio, mode=downsample) + self.add_module('downsample', downsample) + if conv is not None and self.conv_after_downsample: + self.add_module('conv', conv) + else: + if conv is not None: + self.add_module('conv', conv) + if reduction_ratio < 1: + scale = int(1 // reduction_ratio) + self.add_module( + 'upsample', + Interpolate2d(scale_factor=scale, mode=upsample)) + + +class GiraffeCombine(nn.Module): + + def __init__(self, + feature_info, + fpn_config, + fpn_channels, + inputs_offsets, + target_reduction, + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + apply_resample_bn=False, + conv_after_downsample=False, + redundant_bias=False, + weight_method='attn'): + super(GiraffeCombine, self).__init__() + self.inputs_offsets = inputs_offsets + self.weight_method = weight_method + + self.resample = nn.ModuleDict() + reduction_base = feature_info[0]['reduction'] + + target_channels_idx = int( + math.log(target_reduction // reduction_base, 2)) + for idx, offset in enumerate(inputs_offsets): + if offset < len(feature_info): + in_channels = feature_info[offset]['num_chs'] + input_reduction = feature_info[offset]['reduction'] + else: + node_idx = offset + input_reduction = fpn_config[node_idx]['reduction'] + # in_channels = fpn_config[node_idx]['num_chs'] + input_channels_idx = int( + math.log(input_reduction // reduction_base, 2)) + in_channels = feature_info[input_channels_idx]['num_chs'] + + reduction_ratio = target_reduction / input_reduction + if weight_method == 'concat': + self.resample[str(offset)] = ResampleFeatureMap( + in_channels, + in_channels, + reduction_ratio=reduction_ratio, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias) + else: + self.resample[str(offset)] = ResampleFeatureMap( + in_channels, + fpn_channels[target_channels_idx], + reduction_ratio=reduction_ratio, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias) + + if weight_method == 'attn' or weight_method == 'fastattn': + self.edge_weights = nn.Parameter( + torch.ones(len(inputs_offsets)), requires_grad=True) # WSM + else: + self.edge_weights = None + + def forward(self, x: List[torch.Tensor]): + dtype = x[0].dtype + nodes = [] + if len(self.inputs_offsets) == 0: + return None + for offset, resample in zip(self.inputs_offsets, + self.resample.values()): + input_node = x[offset] + input_node = resample(input_node) + nodes.append(input_node) + + if self.weight_method == 'attn': + normalized_weights = torch.softmax( + self.edge_weights.to(dtype=dtype), dim=0) + out = torch.stack(nodes, dim=-1) * normalized_weights + out = torch.sum(out, dim=-1) + elif self.weight_method == 'fastattn': + edge_weights = nn.functional.relu( + self.edge_weights.to(dtype=dtype)) + weights_sum = torch.sum(edge_weights) + weights_norm = weights_sum + 0.0001 + out = torch.stack([(nodes[i] * edge_weights[i]) / weights_norm + for i in range(len(nodes))], + dim=-1) + + out = torch.sum(out, dim=-1) + elif self.weight_method == 'sum': + out = torch.stack(nodes, dim=-1) + out = torch.sum(out, dim=-1) + elif self.weight_method == 'concat': + out = torch.cat(nodes, dim=1) + else: + raise ValueError('unknown weight_method {}'.format( + self.weight_method)) + return out + + +class GiraffeNode(nn.Module): + """ A simple 
wrapper used in place of nn.Sequential for torchscript typing + Handles input type List[Tensor] -> output type Tensor + """ + + def __init__(self, combine: nn.Module, after_combine: nn.Module): + super(GiraffeNode, self).__init__() + self.combine = combine + self.after_combine = after_combine + + def forward(self, x: List[torch.Tensor]) -> torch.Tensor: + combine_feat = self.combine(x) + if combine_feat is None: + return None + else: + return self.after_combine(combine_feat) + + +class GiraffeLayer(nn.Module): + + def __init__(self, + feature_info, + fpn_config, + inner_fpn_channels, + outer_fpn_channels, + num_levels=5, + pad_type='', + downsample=None, + upsample=None, + norm_layer=nn.BatchNorm2d, + act_layer=_ACT_LAYER, + apply_resample_bn=False, + conv_after_downsample=True, + conv_bn_relu_pattern=False, + separable_conv=True, + redundant_bias=False, + merge_type='conv'): + super(GiraffeLayer, self).__init__() + self.num_levels = num_levels + self.conv_bn_relu_pattern = False + + self.feature_info = {} + for idx, feat in enumerate(feature_info): + self.feature_info[idx] = feat + + self.fnode = nn.ModuleList() + reduction_base = feature_info[0]['reduction'] + for i, fnode_cfg in fpn_config.items(): + logging.debug('fnode {} : {}'.format(i, fnode_cfg)) + + if fnode_cfg['is_out'] == 1: + fpn_channels = outer_fpn_channels + else: + fpn_channels = inner_fpn_channels + + reduction = fnode_cfg['reduction'] + fpn_channels_idx = int(math.log(reduction // reduction_base, 2)) + combine = GiraffeCombine( + self.feature_info, + fpn_config, + fpn_channels, + tuple(fnode_cfg['inputs_offsets']), + target_reduction=reduction, + pad_type=pad_type, + downsample=downsample, + upsample=upsample, + norm_layer=norm_layer, + apply_resample_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias, + weight_method=fnode_cfg['weight_method']) + + after_combine = nn.Sequential() + + in_channels = 0 + out_channels = 0 + for input_offset in fnode_cfg['inputs_offsets']: + in_channels += self.feature_info[input_offset]['num_chs'] + + out_channels = fpn_channels[fpn_channels_idx] + + if merge_type == 'csp': + after_combine.add_module( + 'CspLayer', + CSPLayer( + in_channels, + out_channels, + 2, + shortcut=True, + depthwise=False, + act='silu')) + elif merge_type == 'shuffle': + after_combine.add_module( + 'shuffleBlock', ShuffleBlock(in_channels, in_channels)) + after_combine.add_module( + 'conv1x1', + create_conv2d(in_channels, out_channels, kernel_size=1)) + elif merge_type == 'conv': + after_combine.add_module( + 'conv1x1', + create_conv2d(in_channels, out_channels, kernel_size=1)) + conv_kwargs = dict( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=pad_type, + bias=False, + norm_layer=norm_layer, + act_layer=act_layer) + if not conv_bn_relu_pattern: + conv_kwargs['bias'] = redundant_bias + conv_kwargs['act_layer'] = None + after_combine.add_module('act', act_layer(inplace=True)) + after_combine.add_module( + 'conv', + SeparableConv2d(**conv_kwargs) + if separable_conv else ConvBnAct2d(**conv_kwargs)) + + self.fnode.append( + GiraffeNode(combine=combine, after_combine=after_combine)) + self.feature_info[i] = dict( + num_chs=fpn_channels[fpn_channels_idx], reduction=reduction) + + self.out_feature_info = [] + out_node = list(self.feature_info.keys())[-num_levels::] + for i in out_node: + self.out_feature_info.append(self.feature_info[i]) + + self.feature_info = self.out_feature_info + + def forward(self, x: List[torch.Tensor]): + for 
fn in self.fnode: + x.append(fn(x)) + return x[-self.num_levels::] + + +class GiraffeNeck(nn.Module): + + def __init__(self, min_level, max_level, num_levels, norm_layer, + norm_kwargs, act_type, fpn_config, fpn_name, fpn_channels, + out_fpn_channels, weight_method, depth_multiplier, + width_multiplier, with_backslash, with_slash, + with_skip_connect, skip_connect_type, separable_conv, + feature_info, merge_type, pad_type, downsample_type, + upsample_type, apply_resample_bn, conv_after_downsample, + redundant_bias, conv_bn_relu_pattern, alternate_init): + super(GiraffeNeck, self).__init__() + + self.num_levels = num_levels + self.min_level = min_level + self.in_features = [0, 1, 2, 3, 4, 5, + 6][self.min_level - 1:self.min_level - 1 + + num_levels] + self.alternate_init = alternate_init + norm_layer = norm_layer or nn.BatchNorm2d + if norm_kwargs: + norm_layer = partial(norm_layer, **norm_kwargs) + act_layer = get_act_layer(act_type) or _ACT_LAYER + fpn_config = fpn_config or get_graph_config( + fpn_name, + min_level=min_level, + max_level=max_level, + weight_method=weight_method, + depth_multiplier=depth_multiplier, + with_backslash=with_backslash, + with_slash=with_slash, + with_skip_connect=with_skip_connect, + skip_connect_type=skip_connect_type) + + # width scale + for i in range(len(fpn_channels)): + fpn_channels[i] = int(fpn_channels[i] * width_multiplier) + + self.resample = nn.ModuleDict() + for level in range(num_levels): + if level < len(feature_info): + in_chs = feature_info[level]['num_chs'] + reduction = feature_info[level]['reduction'] + else: + # Adds a coarser level by downsampling the last feature map + reduction_ratio = 2 + self.resample[str(level)] = ResampleFeatureMap( + in_channels=in_chs, + out_channels=feature_info[level - 1]['num_chs'], + pad_type=pad_type, + downsample=downsample_type, + upsample=upsample_type, + norm_layer=norm_layer, + reduction_ratio=reduction_ratio, + apply_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + redundant_bias=redundant_bias, + ) + in_chs = feature_info[level - 1]['num_chs'] + reduction = int(reduction * reduction_ratio) + feature_info.append(dict(num_chs=in_chs, reduction=reduction)) + + self.cell = SequentialList() + logging.debug('building giraffeNeck') + giraffe_layer = GiraffeLayer( + feature_info=feature_info, + fpn_config=fpn_config, + inner_fpn_channels=fpn_channels, + outer_fpn_channels=out_fpn_channels, + num_levels=num_levels, + pad_type=pad_type, + downsample=downsample_type, + upsample=upsample_type, + norm_layer=norm_layer, + act_layer=act_layer, + separable_conv=separable_conv, + apply_resample_bn=apply_resample_bn, + conv_after_downsample=conv_after_downsample, + conv_bn_relu_pattern=conv_bn_relu_pattern, + redundant_bias=redundant_bias, + merge_type=merge_type) + self.cell.add_module('giraffeNeck', giraffe_layer) + feature_info = giraffe_layer.feature_info + + def init_weights(self, pretrained=False): + for n, m in self.named_modules(): + if 'backbone' not in n: + if self.alternate_init: + _init_weight_alt(m, n) + else: + _init_weight(m, n) + + def forward(self, x: List[torch.Tensor]): + if type(x) is tuple: + x = list(x) + x = [x[f] for f in self.in_features] + for resample in self.resample.values(): + x.append(resample(x[-1])) + x = self.cell(x) + return x diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py new file mode 100644 index 00000000..b710572f --- /dev/null +++ 
b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py @@ -0,0 +1,203 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import torch +import torch.nn as nn + +from ..core.base_ops import BaseConv, CSPLayer, DWConv +from ..core.neck_ops import CSPStage + + +class GiraffeNeckV2(nn.Module): + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=[2, 3, 4], + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + depthwise=False, + act='silu', + spp=True, + reparam_mode=True, + block_name='BasicBlock', + ): + super().__init__() + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + reparam_mode = reparam_mode + + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + + # node x3: input x0, x1 + self.bu_conv13 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_3 = CSPStage( + block_name, + int((in_channels[1] + in_channels[2]) * width), + int(in_channels[2] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_3 = CSPLayer( + int((in_channels[1] + in_channels[2]) * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x4: input x1, x2, x3 + self.bu_conv24 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_4 = CSPStage( + block_name, + int((in_channels[0] + in_channels[1] + in_channels[2]) + * width), + int(in_channels[1] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_4 = CSPLayer( + int((in_channels[0] + in_channels[1] + in_channels[2]) + * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x5: input x2, x4 + if reparam_mode: + self.merge_5 = CSPStage( + block_name, + int((in_channels[1] + in_channels[0]) * width), + int(out_channels[0] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_5 = CSPLayer( + int((in_channels[1] + in_channels[0]) * width), + int(out_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x7: input x4, x5 + self.bu_conv57 = Conv( + int(out_channels[0] * width), + int(out_channels[0] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_7 = CSPStage( + block_name, + int((out_channels[0] + in_channels[1]) * width), + int(out_channels[1] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_7 = CSPLayer( + int((out_channels[0] + in_channels[1]) * width), + int(out_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + # node x6: input x3, x4, x7 + self.bu_conv46 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.bu_conv76 = Conv( + int(out_channels[1] * width), + int(out_channels[1] * width), + 3, + 2, + act=act) + if reparam_mode: + self.merge_6 = CSPStage( + block_name, + int((in_channels[1] + out_channels[1] + in_channels[2]) + * width), + int(out_channels[2] * width), + round(3 * depth), + act=act, + spp=spp) + else: + self.merge_6 = CSPLayer( + int((in_channels[1] + out_channels[1] + in_channels[2]) + * width), + int(out_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act) + + def init_weights(self): + pass + + def forward(self, out_features): + 
""" + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. + """ + + # backbone + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + # node x3 + x13 = self.bu_conv13(x1) + x3 = torch.cat([x0, x13], 1) + x3 = self.merge_3(x3) + + # node x4 + x34 = self.upsample(x3) + x24 = self.bu_conv24(x2) + x4 = torch.cat([x1, x24, x34], 1) + x4 = self.merge_4(x4) + + # node x5 + x45 = self.upsample(x4) + x5 = torch.cat([x2, x45], 1) + x5 = self.merge_5(x5) + + # node x7 + x57 = self.bu_conv57(x5) + x7 = torch.cat([x4, x57], 1) + x7 = self.merge_7(x7) + + # node x6 + x46 = self.bu_conv46(x4) + x76 = self.bu_conv76(x7) + x6 = torch.cat([x3, x46, x76], 1) + x6 = self.merge_6(x6) + + outputs = (x5, x7, x6) + return outputs diff --git a/modelscope/models/cv/tinynas_detection/tinynas_detector.py b/modelscope/models/cv/tinynas_detection/tinynas_detector.py new file mode 100644 index 00000000..e6f144df --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/tinynas_detector.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .detector import SingleStageDetector + + +@MODELS.register_module( + Tasks.image_object_detection, module_name=Models.tinynas_detection) +class TinynasDetector(SingleStageDetector): + + def __init__(self, model_dir, *args, **kwargs): + + super(TinynasDetector, self).__init__(model_dir, *args, **kwargs) diff --git a/modelscope/models/cv/tinynas_detection/utils.py b/modelscope/models/cv/tinynas_detection/utils.py new file mode 100644 index 00000000..d67d3a36 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# The AIRDet implementation is also open-sourced by the authors, and available at https://github.com/tinyvision/AIRDet. + +import importlib +import os +import sys +from os.path import dirname, join + + +def get_config_by_file(config_file): + try: + sys.path.append(os.path.dirname(config_file)) + current_config = importlib.import_module( + os.path.basename(config_file).split('.')[0]) + exp = current_config.Config() + except Exception: + raise ImportError( + "{} doesn't contains class named 'Config'".format(config_file)) + return exp + + +def parse_config(config_file): + """ + get config object by file. + Args: + config_file (str): file path of config. + """ + assert (config_file is not None), 'plz provide config file' + if config_file is not None: + return get_config_by_file(config_file) diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py new file mode 100644 index 00000000..b2063629 --- /dev/null +++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from typing import Any, Dict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_object_detection, module_name=Pipelines.tinynas_detection) +class TinynasDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + self.model.to(self.device) + self.model.eval() + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_ndarray(input) + self.img = img + img = img.astype(np.float) + img = self.model.preprocess(img) + result = {'img': img.to(self.device)} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = {'data': outputs} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + bboxes, scores, labels = self.model.postprocess(inputs['data']) + if bboxes is None: + return None + outputs = { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: bboxes + } + return outputs diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py new file mode 100644 index 00000000..6b2ecd0b --- /dev/null +++ b/tests/pipelines/test_tinynas_detection.py @@ -0,0 +1,20 @@ +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TinynasObjectDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + tinynas_object_detection = pipeline( + Tasks.image_object_detection, model='damo/cv_tinynas_detection') + result = tinynas_object_detection( + 'data/test/images/image_detection.jpg') + print(result) + + +if __name__ == '__main__': + unittest.main() From 1a22fa02228f0884bcb48bdaccc4f90a24c85009 Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Fri, 2 Sep 2022 14:06:08 +0800 Subject: [PATCH 07/28] fix trainer unittest Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9970626 * fix trainer unittest --- tests/trainers/test_trainer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index 17fa97f9..86909f74 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -17,7 +17,7 @@ from modelscope.metrics.builder import MetricKeys from modelscope.models.base import Model from modelscope.trainers import build_trainer from modelscope.trainers.base import DummyTrainer -from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile +from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile, Tasks from modelscope.utils.test_utils import create_dummy_test_dataset, test_level @@ -67,6 +67,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_0(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 
'work_dir': self.tmp_dir, @@ -141,6 +142,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_1(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, @@ -201,6 +203,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_with_default_config(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, 'dataloader': { @@ -319,6 +322,7 @@ class TrainerTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_train_with_iters_per_epoch(self): json_cfg = { + 'task': Tasks.image_classification, 'train': { 'work_dir': self.tmp_dir, 'dataloader': { From 4d3716cf4ebd0efc814818709234f93eef8e73c5 Mon Sep 17 00:00:00 2001 From: "xingguang.zxg" Date: Fri, 2 Sep 2022 14:14:47 +0800 Subject: [PATCH 08/28] [to #42322933] Text-driven semantic segmentation model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Text-driven semantic segmentation model: based on the input text, it segments the object described by the text out of the image. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9942863 --- data/test/images/text_driven_segmentation.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/text_driven_segmentation/__init__.py | 1 + .../cv/text_driven_segmentation/clip.py | 170 ++++++ .../cv/text_driven_segmentation/lseg_base.py | 28 + .../text_driven_segmentation/lseg_blocks.py | 334 +++++++++++ .../cv/text_driven_segmentation/lseg_model.py | 107 ++++ .../cv/text_driven_segmentation/lseg_net.py | 197 +++++++ .../cv/text_driven_segmentation/lseg_vit.py | 543 ++++++++++++++++++ .../cv/text_driven_segmentation/model.py | 458 +++++++++++++++ .../simple_tokenizer.py | 156 +++++ modelscope/outputs.py | 7 + modelscope/pipelines/builder.py | 3 + modelscope/pipelines/cv/__init__.py | 3 + .../cv/text_driven_segmentation_pipleline.py | 51 ++ modelscope/utils/constant.py | 1 + .../test_text_driven_segmentation.py | 28 + 17 files changed, 2092 insertions(+) create mode 100644 data/test/images/text_driven_segmentation.jpg create mode 100644 modelscope/models/cv/text_driven_segmentation/__init__.py create mode 100644 modelscope/models/cv/text_driven_segmentation/clip.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_base.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_blocks.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_model.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_net.py create mode 100644 modelscope/models/cv/text_driven_segmentation/lseg_vit.py create mode 100644 modelscope/models/cv/text_driven_segmentation/model.py create mode 100644 modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py create mode 100644 modelscope/pipelines/cv/text_driven_segmentation_pipleline.py create mode 100644 tests/pipelines/test_text_driven_segmentation.py diff --git a/data/test/images/text_driven_segmentation.jpg b/data/test/images/text_driven_segmentation.jpg new file mode 100644 index 00000000..e3320b1f --- /dev/null +++ b/data/test/images/text_driven_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7d2f279e3b317f1d0de18410a0585e122166fa2464c17b88a0c813f6c58bd4 +size 67861 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 
fd653bac..3225710a 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -29,6 +29,7 @@ class Models(object): video_summarization = 'pgl-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' + text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' # EasyCV models @@ -143,6 +144,7 @@ class Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' + text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' # nlp tasks diff --git a/modelscope/models/cv/text_driven_segmentation/__init__.py b/modelscope/models/cv/text_driven_segmentation/__init__.py new file mode 100644 index 00000000..46daad78 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/__init__.py @@ -0,0 +1 @@ +from .lseg_base import TextDrivenSegmentation diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py new file mode 100644 index 00000000..440cccea --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -0,0 +1,170 @@ +""" CLIP +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. +""" + +import hashlib +import os +import urllib +import warnings +from typing import Any, List, Union + +import torch +from PIL import Image +from pkg_resources import packaging +from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, + ToTensor) +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.7.1'): + warnings.warn('PyTorch version 1.7.1 or higher is recommended') +__all__ = ['load', 'tokenize'] + + +def _convert_image_to_rgb(image): + return image.convert('RGB') + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def load(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit: bool = False, + root: str = None): + + if not jit: + model = build_model().to(device) + if str(device) == 'cpu': + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + node['value']).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to 
float32 on CPU + if str(device) == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [ + 1, 2 + ]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()['value'] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize( + _tokenizer, + texts: Union[str, List[str]], + context_length: int = 77, + truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]. + We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long. + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder['<|startoftext|>'] + eot_token = _tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] + for text in texts] + if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.8.0'): + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + else: + result = torch.zeros(len(all_tokens), context_length, dtype=torch.int) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + else: + raise RuntimeError( + f'Input {texts[i]} is too long for context length {context_length}' + ) + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_base.py b/modelscope/models/cv/text_driven_segmentation/lseg_base.py new file mode 100644 index 00000000..20915396 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_base.py @@ -0,0 +1,28 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. 
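A minimal usage sketch for the tokenize() helper above; the vocab path below is a placeholder, and only the SOT/EOT padding and the [N, 77] output shape come from the code in this patch:

from modelscope.models.cv.text_driven_segmentation.clip import tokenize
from modelscope.models.cv.text_driven_segmentation.simple_tokenizer import SimpleTokenizer

# placeholder path; the real bpe_simple_vocab_16e6.txt.gz ships with the model files
tokenizer = SimpleTokenizer('/path/to/bpe_simple_vocab_16e6.txt.gz')
tokens = tokenize(tokenizer, ['others', 'a red car'])
print(tokens.shape)  # torch.Size([2, 77]); dtype is int or long depending on the torch version
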
+""" + +import torch +import torch.nn as nn + +from .lseg_net import LSeg + + +class TextDrivenSegmentation(nn.Module): + + def __init__(self, model_dir): + super(TextDrivenSegmentation, self).__init__() + self.net = LSeg(model_dir=model_dir) + self.model_dir = model_dir + + def forward(self, img, txt_list): + b = img.size()[0] + batch_name_list = txt_list + xout_list = [] + for i in range(b): + labelset = ['others', batch_name_list[i]] + xout = self.net(img[i:i + 1], labelset=labelset) + xout_list.append(xout) + score_map = torch.cat(xout_list, dim=0) + return score_map diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py new file mode 100644 index 00000000..cb550ab7 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_blocks.py @@ -0,0 +1,334 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import torch +import torch.nn as nn + +from .lseg_vit import _make_pretrained_clip_vitl16_384, forward_vit + + +def _make_encoder( + backbone, + features, + use_pretrained=True, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout='ignore', + enable_attention_hooks=False, +): + if backbone == 'clip_vitl16_384': + clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384( + use_pretrained, + hooks=hooks, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + scratch = _make_scratch([256, 512, 1024, 1024], + features, + groups=groups, + expand=expand) + else: + raise NotImplementedError(f"Backbone '{backbone}' not implemented") + + return clip_pretrained, pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand is True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + + return scratch + + +class Interpolate(nn.Module): + """Interpolation module.""" + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + ) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module.""" + + def __init__(self, features): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block.""" + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode='bilinear', align_corners=True) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + if self.bn is True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn is True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn is True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + ): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand is True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1, + ) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
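A shape-level sketch of how these fusion blocks are wired into the LSeg decoder later in this patch; the tensors are random and the sizes correspond to a 640x640 input, so treat the numbers as illustrative only:

import torch
import torch.nn as nn

from modelscope.models.cv.text_driven_segmentation.lseg_blocks import FeatureFusionBlock_custom

# coarser decoder path and a same-scale skip connection, both already at 256 channels
fuse = FeatureFusionBlock_custom(256, nn.ReLU(False), bn=True, align_corners=True)
path_4 = torch.randn(1, 256, 40, 40)
layer_3_rn = torch.randn(1, 256, 40, 40)
print(fuse(path_4, layer_3_rn).shape)  # torch.Size([1, 256, 80, 80]) after the 2x upsample
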
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + + output = self.out_conv(output) + return output diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py new file mode 100644 index 00000000..1d7ebdd1 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py @@ -0,0 +1,107 @@ +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.text_driven_segmentation import \ + TextDrivenSegmentation +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() +__all__ = ['TextDrivenSeg'] + + +@MODELS.register_module( + Tasks.text_driven_segmentation, + module_name=Models.text_driven_segmentation) +class TextDrivenSeg(TorchModel): + """ text driven segmentation model. + """ + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + self.model = TextDrivenSegmentation(model_dir=model_dir) + pretrained_params = torch.load('{}/{}'.format( + model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + self.model.load_state_dict(pretrained_params) + self.model.eval() + if device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(device_id)) + logger.info('Use GPU: {}'.format(device_id)) + else: + device_id = -1 + logger.info('Use CPU for inference') + self.device_id = device_id + + def preprocess(self, img, size=640): + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + h, w, c = img.shape + max_hw = max(h, w) + ratio = 1.0 * size / max_hw + crop_h, crop_w = int(ratio * h), int(ratio * w) + pil_img = Image.fromarray(img) + pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR) + np_img = np.array(pil_img, dtype=np.float32) / 255. + for j in range(3): + np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j] + img_pad = np.zeros((size, size, 3), dtype=np.float32) + img_pad[:crop_h, :crop_w] = np_img + img_pad = torch.from_numpy(img_pad).permute(2, 0, + 1).unsqueeze(0).float() + return img_pad, h, w, crop_h, crop_w + + def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w): + output = np.clip(tensors * 255., a_min=0, a_max=255.) 
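# Worked example for preprocess() above (illustrative numbers): a 320x512
# (h x w) uint8 image gives max_hw = 512 and ratio = 640 / 512 = 1.25, so
# crop_h = 400 and crop_w = 640; the image is resized to 400x640, normalized
# with the CLIP mean/std, zero-padded into a 640x640 canvas and returned as a
# (1, 3, 640, 640) float tensor. crop_h/crop_w are kept so that postprocess()
# can drop the padded region before resizing the mask back to ori_h x ori_w.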
+ crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8) + pil_output = Image.fromarray(crop_output) + pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR) + np_output = np.array(pil_output, dtype=np.uint8) + np_output[np_output < 128] = 0 + np_output[np_output >= 128] = 255 + np_output = np.uint8(np_output) + return np_output + + def forward(self, image, text): + """ + image should be numpy array, dtype=np.uint8, shape: height*width*3 + """ + image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess( + image, size=640) + pred = self.inference(image_tensor, text) + msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w) + outputs = {OutputKeys.MASKS: msk} + return outputs + + def inference(self, image, text): + """ + image should be tensor, 1 * 3 * 640 * 640 + """ + with torch.no_grad(): + if self.device_id == -1: + output = self.model(image, [text]) + else: + device = torch.device('cuda', self.device_id) + output = self.model(image.to(device), [text]) + output = F.interpolate(output, size=(640, 640), mode='bilinear') + output = F.softmax(output, dim=1) + output = torch.argmax(output, dim=1) + output = output[0] + if self.device_id == -1: + pred = output.data.numpy() + else: + pred = output.data.cpu().numpy() + del output + return pred diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_net.py b/modelscope/models/cv/text_driven_segmentation/lseg_net.py new file mode 100644 index 00000000..1a558c5c --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_net.py @@ -0,0 +1,197 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import numpy as np +import torch +import torch.nn as nn + +from . import clip +from .lseg_blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, + Interpolate, _make_encoder, forward_vit) +from .simple_tokenizer import SimpleTokenizer + + +class depthwise_clipseg_conv(nn.Module): + + def __init__(self): + super(depthwise_clipseg_conv, self).__init__() + self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1) + + def depthwise_clipseg(self, x, channels): + x = torch.cat( + [self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)], + dim=1) + return x + + def forward(self, x): + channels = x.shape[1] + out = self.depthwise_clipseg(x, channels) + return out + + +class depthwise_conv(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1): + super(depthwise_conv, self).__init__() + self.depthwise = nn.Conv2d( + 1, 1, kernel_size=kernel_size, stride=stride, padding=padding) + + def forward(self, x): + # support for 4D tensor with NCHW + C, H, W = x.shape[1:] + x = x.reshape(-1, 1, H, W) + x = self.depthwise(x) + x = x.view(-1, C, H, W) + return x + + +class depthwise_block(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(depthwise_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + def forward(self, x, act=True): + x = self.depthwise(x) + if act: + x = self.activation(x) + return x + + +class bottleneck_block(nn.Module): + + def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'): + super(bottleneck_block, self).__init__() + self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1) + if activation == 
'relu': + self.activation = nn.ReLU() + elif activation == 'lrelu': + self.activation = nn.LeakyReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() + + def forward(self, x, act=True): + sum_layer = x.max(dim=1, keepdim=True)[0] + x = self.depthwise(x) + x = x + sum_layer + if act: + x = self.activation(x) + return x + + +class BaseModel(torch.nn.Module): + + def load(self, path): + """Load model from file. + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if 'optimizer' in parameters: + parameters = parameters['model'] + + self.load_state_dict(parameters) + + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + activation=nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + + +class LSeg(BaseModel): + + def __init__( + self, + features=256, + backbone='clip_vitl16_384', + readout='project', + use_bn=True, + model_dir=None, + ): + super(LSeg, self).__init__() + hooks = { + 'clip_vitl16_384': [5, 11, 17, 23], + } + + # Instantiate backbone and reassemble blocks + self.clip_pretrained, self.pretrained, self.scratch = _make_encoder( + backbone, + features, + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.logit_scale = nn.Parameter(torch.ones([]) + * np.log(1 / 0.07)).exp() + self.out_c = 512 + self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1) + + self.scratch.output_conv = nn.Sequential( + Interpolate(scale_factor=2, mode='bilinear', align_corners=True), ) + + self.tau = 0.07 + self.model_dir = model_dir + self.tokenizer = SimpleTokenizer(model_dir + + '/bpe_simple_vocab_16e6.txt.gz') + + def forward(self, x, labelset=''): + text = clip.tokenize(self.tokenizer, labelset) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + text = text.to(x.device) + text_features = self.clip_pretrained.encode_text(text) + + image_features = self.scratch.head1(path_1) + + imshape = image_features.shape + image_features = image_features.permute(0, 2, 3, + 1).reshape(-1, self.out_c) + + # normalized features + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + logits_per_image = image_features @ text_features.t() / self.tau + + out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], + -1).permute(0, 3, 1, 2) + + out = self.scratch.output_conv(out) + + return out diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_vit.py b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py new file mode 100644 index 00000000..be2813c2 --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/lseg_vit.py @@ -0,0 +1,543 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. 
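The core of the LSeg head above is a per-pixel cosine similarity between pixel embeddings and the CLIP text embeddings, scaled by 1/tau. A shape-only sketch with random tensors standing in for the real encoders; only the 512-dim embedding width and tau = 0.07 are taken from the code above, the spatial size is just an example:

import torch
import torch.nn.functional as F

B, C, H, W = 1, 512, 320, 320      # example pixel-embedding resolution
K, tau = 2, 0.07                   # 'others' plus one query prompt

image_features = F.normalize(torch.randn(B, C, H, W), dim=1)
text_features = F.normalize(torch.randn(K, C), dim=1)

# (B, C, H, W) x (K, C) -> (B, K, H, W) per-pixel logits
logits = torch.einsum('bchw,kc->bkhw', image_features, text_features) / tau
mask = logits.softmax(dim=1).argmax(dim=1)      # per-pixel label index
print(logits.shape, mask.shape)                 # [1, 2, 320, 320] and [1, 320, 320]
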
+Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +import math +import types + +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from . import clip + +activations = {} + + +def get_activation(name): + + def hook(model, input, output): + activations[name] = output + + return hook + + +attention = {} + + +def get_attention(name): + + def hook(module, input, output): + x = input[0] + B, N, C = x.shape + qkv = ( + module.qkv(x).reshape(B, N, 3, module.num_heads, + C // module.num_heads).permute( + 2, 0, 3, 1, 4)) + q, k, _ = ( + qkv[0], + qkv[1], + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * module.scale + + attn = attn.softmax(dim=-1) # [:,:,1,1:] + attention[name] = attn + + return hook + + +def get_mean_attention_map(attn, token, shape): + attn = attn[:, :, token, 1:] + attn = attn.unflatten(2, torch.Size([shape[2] // 16, + shape[3] // 16])).float() + attn = torch.nn.functional.interpolate( + attn, size=shape[2:], mode='bicubic', align_corners=False).squeeze(0) + + all_attn = torch.mean(attn, 0) + + return all_attn + + +class Slice(nn.Module): + + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential( + nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + # encoder + _ = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations['1'] + layer_2 = pretrained.activations['2'] + layer_3 = pretrained.activations['3'] + layer_4 = pretrained.activations['4'] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size([ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ]), + )) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)]( + layer_1) + layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)]( + layer_2) + layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)]( + layer_3) + layer_4 = 
pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)]( + layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, :self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=(gs_h, gs_w), mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1], + w // self.patch_size[0]) + + B = x.shape[0] + + if hasattr(self.patch_embed, 'backbone'): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[ + -1] # last feature if backbone outputs list/tuple of features + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, 'dist_token', None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + gradient_checkpoint = False + for blk in self.blocks: + if gradient_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.norm(x) + + return x + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == 'ignore': + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == 'add': + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == 'project': + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def adapt_input_conv(in_chans, conv_weight): + conv_type = conv_weight.dtype + conv_weight = conv_weight.float( + ) # Some weights are in torch.half, ensure it's float for sum on CPU + O, II, J, K = conv_weight.shape + if in_chans == 1: + if II > 3: + assert conv_weight.shape[1] % 3 == 0 + # For models with space2depth stems + conv_weight = conv_weight.reshape(O, II // 3, 3, J, K) + conv_weight = conv_weight.sum(dim=2, keepdim=False) + else: + conv_weight = conv_weight.sum(dim=1, keepdim=True) + elif in_chans != 3: + if II != 3: + raise NotImplementedError( + 'Weight format not supported by conversion.') + else: + # NOTE this strategy should be better than random init, but there could be other combinations of + # the original RGB input layer weights that'd work better for specific cases. 
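A quick shape check for the adapt_input_conv() defined here, using random weights; the (1024, 3, 16, 16) kernel matches a ViT-L/16 patch embedding and is only an example:

import torch

from modelscope.models.cv.text_driven_segmentation.lseg_vit import adapt_input_conv

rgb_weight = torch.randn(1024, 3, 16, 16)       # O, I, H, W of a ViT-L/16 patch embed
print(adapt_input_conv(1, rgb_weight).shape)    # torch.Size([1024, 1, 16, 16]), RGB kernels summed
print(adapt_input_conv(4, rgb_weight).shape)    # torch.Size([1024, 4, 16, 16]), tiled and rescaled
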
+ repeat = int(math.ceil(in_chans / 3)) + conv_weight = conv_weight.repeat(1, repeat, 1, + 1)[:, :in_chans, :, :] + conv_weight *= (3 / float(in_chans)) + conv_weight = conv_weight.to(conv_type) + return conv_weight + + +@torch.no_grad() +def _load_weights(model, checkpoint_path, prefix=''): + """ Load weights from .npz checkpoints for official Google Brain Flax implementation + """ + import numpy as np + + def _n2p(w, t=True): + if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: + w = w.flatten() + if t: + if w.ndim == 4: + w = w.transpose([3, 2, 0, 1]) + elif w.ndim == 3: + w = w.transpose([2, 0, 1]) + elif w.ndim == 2: + w = w.transpose([1, 0]) + return torch.from_numpy(w) + + w = np.load(checkpoint_path) + if not prefix and 'opt/target/embedding/kernel' in w: + prefix = 'opt/target/' + + if hasattr(model.patch_embed, 'backbone'): + # hybrid + backbone = model.patch_embed.backbone + stem_only = not hasattr(backbone, 'stem') + stem = backbone if stem_only else backbone.stem + stem.conv.weight.copy_( + adapt_input_conv(stem.conv.weight.shape[1], + _n2p(w[f'{prefix}conv_root/kernel']))) + stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale'])) + stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias'])) + if not stem_only: + for i, stage in enumerate(backbone.stages): + for j, block in enumerate(stage.blocks): + bp = f'{prefix}block{i + 1}/unit{j + 1}/' + for r in range(3): + getattr(block, f'conv{r + 1}').weight.copy_( + _n2p(w[f'{bp}conv{r + 1}/kernel'])) + getattr(block, f'norm{r + 1}').weight.copy_( + _n2p(w[f'{bp}gn{r + 1}/scale'])) + getattr(block, f'norm{r + 1}').bias.copy_( + _n2p(w[f'{bp}gn{r + 1}/bias'])) + if block.downsample is not None: + block.downsample.conv.weight.copy_( + _n2p(w[f'{bp}conv_proj/kernel'])) + block.downsample.norm.weight.copy_( + _n2p(w[f'{bp}gn_proj/scale'])) + block.downsample.norm.bias.copy_( + _n2p(w[f'{bp}gn_proj/bias'])) + embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) + else: + embed_conv_w = adapt_input_conv(model.patch_embed.proj.weight.shape[1], + _n2p(w[f'{prefix}embedding/kernel'])) + model.patch_embed.proj.weight.copy_(embed_conv_w) + model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) + model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) + pos_embed_w = _n2p( + w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False) + if pos_embed_w.shape != model.pos_embed.shape: + pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, model.pos_embed, getattr(model, 'num_prefix_tokens', + 1), + model.patch_embed.grid_size) + model.pos_embed.copy_(pos_embed_w) + model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale'])) + model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias'])) + if isinstance( + model.head, nn.Linear + ) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]: + model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel'])) + model.head.bias.copy_(_n2p(w[f'{prefix}head/bias'])) + # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights + # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w: + # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel'])) + # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias'])) + for i, block in enumerate(model.blocks.children()): + block_prefix = f'{prefix}Transformer/encoderblock_{i}/' + mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/' + 
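For orientation, _n2p() above converts Flax-layout arrays to PyTorch layout: 4-D conv kernels go from HWIO to OIHW and 2-D dense kernels are transposed. A tiny standalone illustration of the 4-D case with a dummy array:

import numpy as np
import torch

flax_kernel = np.zeros((16, 16, 3, 1024), dtype=np.float32)            # H, W, I, O
torch_kernel = torch.from_numpy(flax_kernel.transpose([3, 2, 0, 1]))   # same transpose as _n2p
print(torch_kernel.shape)                                              # torch.Size([1024, 3, 16, 16]) -> O, I, H, W
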
block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) + block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + block.attn.qkv.weight.copy_( + torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T + for n in ('query', 'key', 'value') + ])) + block.attn.qkv.bias.copy_( + torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) + for n in ('query', 'key', 'value') + ])) + block.attn.proj.weight.copy_( + _n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + for r in range(2): + getattr(block.mlp, f'fc{r + 1}').weight.copy_( + _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel'])) + getattr(block.mlp, f'fc{r + 1}').bias.copy_( + _n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias'])) + block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale'])) + block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias'])) + + +def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + ntok_new = posemb_new.shape[1] + if num_prefix_tokens: + posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[ + 0, num_prefix_tokens:] + ntok_new -= num_prefix_tokens + else: + posemb_prefix, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=gs_new, mode='bicubic', align_corners=False) + posemb_grid = posemb_grid.permute(0, 2, 3, + 1).reshape(1, gs_new[0] * gs_new[1], -1) + posemb = torch.cat([posemb_prefix, posemb_grid], dim=1) + return posemb + + +def _make_pretrained_clip_vitl16_384(pretrained, + use_readout='ignore', + hooks=None, + enable_attention_hooks=False): + clip_pretrained, _ = clip.load('ViT-B/32', device='cpu', jit=False) + + # model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + model = timm.create_model('vit_large_patch16_384', pretrained=False) + hooks = [5, 11, 17, 23] if hooks is None else hooks + pretrained = _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + enable_attention_hooks=enable_attention_hooks, + ) + return clip_pretrained, pretrained + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout='ignore', + start_index=1, + enable_attention_hooks=False, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + if enable_attention_hooks: + pretrained.model.blocks[hooks[0]].attn.register_forward_hook( + get_attention('attn_1')) + pretrained.model.blocks[hooks[1]].attn.register_forward_hook( + get_attention('attn_2')) + pretrained.model.blocks[hooks[2]].attn.register_forward_hook( + 
get_attention('attn_3')) + pretrained.model.blocks[hooks[3]].attn.register_forward_hook( + get_attention('attn_4')) + pretrained.attention = attention + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained diff --git a/modelscope/models/cv/text_driven_segmentation/model.py b/modelscope/models/cv/text_driven_segmentation/model.py new file mode 100644 index 00000000..ece10bab --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/model.py @@ -0,0 +1,458 @@ +""" +Adapted from https://github.com/isl-org/lang-seg. +Originally MIT License, Copyright (c) 2021 Intelligent Systems Lab Org. +""" + +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.relu1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu2 = nn.ReLU(inplace=True) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu3 = nn.ReLU(inplace=True) + + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu3(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, width, layers, heads, attn_mask=None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): 
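+        # The blocks are wrapped in nn.Sequential, so a single call applies every ResidualAttentionBlock in order.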
+ return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x1 = self.class_embedding.to(x.dtype) + x2 = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([x1 + x2, x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block 
in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.transformer.width**-0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(ll): + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Linear)): + ll.weight.data = ll.weight.data.half() + if ll.bias is not None: + ll.bias.data = ll.bias.data.half() + + if isinstance(ll, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(ll, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ['text_projection', 'proj']: + if hasattr(ll, name): + attr = getattr(ll, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(): + model = CLIP(512, 224, 12, 768, 32, 77, 49408, 512, 8, 12) + convert_weights(model) + return model.eval() diff --git a/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py new file mode 100644 index 00000000..250d680f --- /dev/null +++ b/modelscope/models/cv/text_driven_segmentation/simple_tokenizer.py @@ -0,0 +1,156 @@ +""" CLIP +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. 
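+
+A minimal usage sketch (assumes the bundled bpe_simple_vocab_16e6.txt.gz sits next to this file,
+as default_bpe() below expects):
+    tokenizer = SimpleTokenizer()
+    token_ids = tokenizer.encode('a photo of a dog')
+    text = tokenizer.decode(token_ids)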
+""" + +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + error_list = [] + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception as err: + new_word.extend(word[i:]) + error_list.append(err) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def 
encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 7d6cdb59..6fada2b0 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -243,6 +243,13 @@ TASK_OUTPUTS = { # "output_img": np.ndarray with shape [height, width, 3] # } Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG], + # text driven segmentation result for single sample + # { + # "masks": [ + # np.array # 2D array containing only 0, 255 + # ] + # } + Tasks.text_driven_segmentation: [OutputKeys.MASKS], # movide scene segmentation result for a single video # { diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index c9f0c252..40c237c8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -149,6 +149,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitb_video-single-object-tracking_ostrack'), Tasks.image_reid_person: (Pipelines.image_reid_person, 'damo/cv_passvitb_image-reid-person_market'), + Tasks.text_driven_segmentation: + (Pipelines.text_driven_segmentation, + 'damo/cv_vitl16_segmentation_text-driven-seg'), Tasks.movie_scene_segmentation: (Pipelines.movie_scene_segmentation, 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index f4e6792b..c8cb0c6a 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -44,6 +44,7 @@ if TYPE_CHECKING: from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline + from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline else: @@ -97,6 +98,8 @@ else: 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], 'easycv_pipeline': ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], + 'text_driven_segmentation_pipeline': + ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], } diff --git a/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py new file mode 100644 index 00000000..0985b835 --- /dev/null +++ b/modelscope/pipelines/cv/text_driven_segmentation_pipleline.py @@ -0,0 +1,51 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_driven_segmentation, + module_name=Pipelines.text_driven_segmentation) +class TextDrivenSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Dict) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input['image']) + img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img) + result = { + 'img': img_tensor, + 'ori_h': ori_h, + 'ori_w': ori_w, + 'crop_h': crop_h, + 'crop_w': crop_w, + 'text': input['text'], + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + outputs = self.model.inference(input['img'], input['text']) + result = { + 'data': outputs, + 'ori_h': input['ori_h'], + 'ori_w': input['ori_w'], + 'crop_h': input['crop_h'], + 'crop_w': input['crop_w'], + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + data = self.model.postprocess(inputs['data'], inputs['crop_h'], + inputs['crop_w'], inputs['ori_h'], + inputs['ori_w']) + outputs = {OutputKeys.MASKS: data} + return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2265ef5a..ed1ec798 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -36,6 +36,7 @@ class CVTasks(object): image_segmentation = 'image-segmentation' portrait_matting = 'portrait-matting' + text_driven_segmentation = 'text-driven-segmentation' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_text_driven_segmentation.py b/tests/pipelines/test_text_driven_segmentation.py new file mode 100644 index 00000000..741787d9 --- /dev/null +++ b/tests/pipelines/test_text_driven_segmentation.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TextDrivenSegmentationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_text_driven_segmentation(self): + input_location = 'data/test/images/text_driven_segmentation.jpg' + test_input = { + 'image': input_location, + 'text': 'bear', + } + model_id = 'damo/cv_vitl16_segmentation_text-driven-seg' + shop_seg = pipeline(Tasks.text_driven_segmentation, model=model_id) + result = shop_seg(test_input) + import cv2 + # result[OutputKeys.MASKS] is segment map result,other keys are not used + cv2.imwrite(input_location + '_lseg.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() From 5a2634610a3e1efca692327ab31988313574156d Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Fri, 2 Sep 2022 20:03:19 +0800 Subject: [PATCH 09/28] [to #42322933]skip sbert_en&bert_ch to save ci time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ![](https://cn-hangzhou.oss-cdn.aliyun-inc.com/git/force/uploads/comment/251924/40165669611078357/image.png) fill mask pipeline 测试时间过长 这个task测了4个模型。从保证代码正确性的功能角度看,只测一个bert类(比如sbert中文),一个roberta类(veco)。减少测试的模型数量以减少测试时长。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006556 * skip sbert_en&bert_ch to save ci time --- tests/pipelines/test_fill_mask.py | 38 ++----------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 1b709e27..6b37f6df 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -43,7 +43,7 @@ class FillMaskTest(unittest.TestCase): 
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): # sbert - for language in ['zh', 'en']: + for language in ['zh']: model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) @@ -74,24 +74,10 @@ class FillMaskTest(unittest.TestCase): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) - # zh bert - language = 'zh' - model_dir = snapshot_download(self.model_id_bert) - preprocessor = FillMaskPreprocessor( - model_dir, first_sequence='sentence', second_sequence=None) - model = BertForMaskedLM.from_pretrained(model_dir) - pipeline1 = FillMaskPipeline(model, preprocessor) - pipeline2 = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - ori_text = self.ori_texts[language] - test_input = self.test_inputs[language] - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' - f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): # sbert - for language in ['zh', 'en']: + for language in ['zh']: print(self.model_id_sbert[language]) model = Model.from_pretrained(self.model_id_sbert[language]) preprocessor = FillMaskPreprocessor( @@ -121,20 +107,6 @@ class FillMaskTest(unittest.TestCase): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') - # zh bert - model = Model.from_pretrained(self.model_id_bert) - preprocessor = FillMaskPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - language = 'zh' - ori_text = self.ori_texts[language] - test_input = self.test_inputs[language] - with self.regress_tool.monitor_module_single_forward( - pipeline_ins.model, 'fill_mask_bert_zh'): - print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' - f'{pipeline_ins(test_input)}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -153,12 +125,6 @@ class FillMaskTest(unittest.TestCase): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') - # bert - pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert) - print( - f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' - f'{pipeline_ins(self.test_inputs[language])}\n') - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) From 4073376f512af16fb62814bade1482d2deb55236 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Fri, 2 Sep 2022 20:53:29 +0800 Subject: [PATCH 10/28] [to #42322933]add face 2d keypoints by EasyCV Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9934673 * add face 2d keypoints --- .../test_img_face_2d_keypoints.png | 3 ++ modelscope/metainfo.py | 3 ++ modelscope/models/cv/__init__.py | 6 +-- .../models/cv/face_2d_keypoints/__init__.py | 20 +++++++++ .../face_2d_keypoints_align.py | 16 ++++++++ .../cv/face_2d_keypoins/__init__.py | 20 +++++++++ .../face_2d_keypoints_dataset.py | 13 ++++++ modelscope/outputs.py | 9 ++++ modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 8 ++-- 
.../pipelines/cv/easycv_pipelines/__init__.py | 4 +- .../face_2d_keypoints_pipeline.py | 41 +++++++++++++++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_face_2d_keypoints.py | 36 ++++++++++++++++ 14 files changed, 175 insertions(+), 7 deletions(-) create mode 100644 data/test/images/keypoints_detect/test_img_face_2d_keypoints.png create mode 100644 modelscope/models/cv/face_2d_keypoints/__init__.py create mode 100644 modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/__init__.py create mode 100644 modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py create mode 100644 modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py create mode 100644 tests/pipelines/test_face_2d_keypoints.py diff --git a/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png new file mode 100644 index 00000000..00311c33 --- /dev/null +++ b/data/test/images/keypoints_detect/test_img_face_2d_keypoints.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331ead75033fa2f01f6be72a2f8e34d581fcb593308067815d4bb136bb13b766 +size 54390 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 3225710a..06b5a476 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -24,6 +24,7 @@ class Models(object): body_2d_keypoints = 'body-2d-keypoints' body_3d_keypoints = 'body-3d-keypoints' crowd_counting = 'HRNetCrowdCounting' + face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' image_reid_person = 'passvitb' video_summarization = 'pgl-video-summarization' @@ -112,6 +113,7 @@ class Pipelines(object): object_detection = 'vit-object-detection' easycv_detection = 'easycv-detection' easycv_segmentation = 'easycv-segmentation' + face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' @@ -353,6 +355,7 @@ class Datasets(object): """ Names for different datasets. """ ClsDataset = 'ClsDataset' + Face2dKeypointsDataset = 'Face2dKeypointsDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 331f23bd..4db43d17 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -3,9 +3,9 @@ # yapf: disable from . import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, - crowd_counting, face_detection, face_generation, - image_classification, image_color_enhance, image_colorization, - image_denoise, image_instance_segmentation, + crowd_counting, face_2d_keypoints, face_detection, + face_generation, image_classification, image_color_enhance, + image_colorization, image_denoise, image_instance_segmentation, image_panoptic_segmentation, image_portrait_enhancement, image_reid_person, image_semantic_segmentation, image_to_image_generation, image_to_image_translation, diff --git a/modelscope/models/cv/face_2d_keypoints/__init__.py b/modelscope/models/cv/face_2d_keypoints/__init__.py new file mode 100644 index 00000000..636ba0f4 --- /dev/null +++ b/modelscope/models/cv/face_2d_keypoints/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
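+# Lazy-import wrapper: Face2DKeypoints (and its easycv dependency) is only resolved
+# when the attribute is first accessed. A typical way to exercise this model end to end,
+# mirroring tests/pipelines/test_face_2d_keypoints.py later in this patch:
+#   face_kps = pipeline(Tasks.face_2d_keypoints,
+#                       model='damo/cv_mobilenet_face-2d-keypoints_alignment')
+#   result = face_kps('data/test/images/keypoints_detect/test_img_face_2d_keypoints.png')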
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .face_2d_keypoints_align import Face2DKeypoints + +else: + _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py new file mode 100644 index 00000000..468662a0 --- /dev/null +++ b/modelscope/models/cv/face_2d_keypoints/face_2d_keypoints_align.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.face.face_keypoint import FaceKeypoint + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints) +class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + FaceKeypoint.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py new file mode 100644 index 00000000..e9d76b7e --- /dev/null +++ b/modelscope/msdatasets/cv/face_2d_keypoins/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .face_2d_keypoints_dataset import FaceKeypointDataset + +else: + _import_structure = {'face_2d_keypoints_dataset': ['FaceKeypointDataset']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py new file mode 100644 index 00000000..a902999d --- /dev/null +++ b/modelscope/msdatasets/cv/face_2d_keypoins/face_2d_keypoints_dataset.py @@ -0,0 +1,13 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
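+# Registers EasyCV's FaceKeypointDataset in the ModelScope TASK_DATASETS registry
+# (group Tasks.face_2d_keypoints, name Datasets.Face2dKeypointsDataset) so trainers
+# can build it from a configuration file.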
+from easycv.datasets.face import FaceKeypointDataset as _FaceKeypointDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.face_2d_keypoints, + module_name=Datasets.Face2dKeypointsDataset) +class FaceKeypointDataset(_FaceKeypointDataset): + """EasyCV dataset for face 2d keypoints.""" diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 6fada2b0..e84c8dcc 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -57,6 +57,15 @@ TASK_OUTPUTS = { # } Tasks.ocr_recognition: [OutputKeys.TEXT], + # face 2d keypoint result for single sample + # { + # "keypoints": [ + # [x1, y1]*106 + # ], + # "poses": [pitch, roll, yaw] + # } + Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES], + # face detection result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05] diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 40c237c8..f43d152b 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -103,6 +103,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), + Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints, + 'damo/cv_mobilenet_face-2d-keypoints_alignment'), Tasks.video_multi_modal_embedding: (Pipelines.video_multi_modal_embedding, 'damo/multi_modal_clip_vtretrival_msrvtt_53'), diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index c8cb0c6a..9e7d80ee 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -43,7 +43,7 @@ if TYPE_CHECKING: from .tinynas_classification_pipeline import TinynasClassificationPipeline from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline - from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline + from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline @@ -96,8 +96,10 @@ else: 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], 'video_category_pipeline': ['VideoCategoryPipeline'], 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], - 'easycv_pipeline': - ['EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline'], + 'easycv_pipeline': [ + 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', + 'Face2DKeypointsPipeline' + ], 'text_driven_segmentation_pipeline': ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py index 0984ff43..4f149130 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .detection_pipeline import EasyCVDetectionPipeline from .segmentation_pipeline import EasyCVSegmentationPipeline + from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline else: _import_structure = { 'detection_pipeline': ['EasyCVDetectionPipeline'], - 'segmentation_pipeline': 
['EasyCVSegmentationPipeline'] + 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], + 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] } import sys diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py new file mode 100644 index 00000000..eb4d6c15 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -0,0 +1,41 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.face_2d_keypoints, module_name=Pipelines.face_2d_keypoints) +class Face2DKeypointsPipeline(EasyCVPipeline): + """Pipeline for face 2d keypoints detection.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. + """ + + super(Face2DKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def show_result(self, img, points, scale=2, save_path=None): + return self.predict_op.show_result(img, points, scale, save_path) + + def __call__(self, inputs) -> Any: + output = self.predict_op(inputs)[0][0] + points = output['point'] + poses = output['pose'] + + return {OutputKeys.KEYPOINTS: points, OutputKeys.POSES: poses} diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index ed1ec798..86808ea1 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -20,6 +20,7 @@ class CVTasks(object): animal_recognition = 'animal-recognition' face_detection = 'face-detection' face_recognition = 'face-recognition' + face_2d_keypoints = 'face-2d-keypoints' human_detection = 'human-detection' human_object_interaction = 'human-object-interaction' face_image_generation = 'face-image-generation' diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py new file mode 100644 index 00000000..a5e347e8 --- /dev/null +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
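+# Smoke test for the face-2d-keypoints pipeline: it should return 106 (x, y)
+# keypoints plus a 3-element (pitch, roll, yaw) pose for the sample image.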
+import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_face_2d_keypoints(self): + img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png' + model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment' + + face_2d_keypoints_align = pipeline( + task=Tasks.face_2d_keypoints, model=model_id) + output = face_2d_keypoints_align(img_path) + + output_keypoints = output[OutputKeys.KEYPOINTS] + output_pose = output[OutputKeys.POSES] + + img = cv2.imread(img_path) + img = face_2d_keypoints_align.show_result( + img, output_keypoints, scale=2, save_path='face_keypoints.jpg') + + self.assertEqual(output_keypoints.shape[0], 106) + self.assertEqual(output_keypoints.shape[1], 2) + self.assertEqual(output_pose.shape[0], 3) + + +if __name__ == '__main__': + unittest.main() From 00487aa6e1ca1b7ac50b5ca90b3290f2a6068d77 Mon Sep 17 00:00:00 2001 From: "xixing.tj" Date: Sat, 3 Sep 2022 11:38:07 +0800 Subject: [PATCH 11/28] [to #42322933]add error msg when no text detected for ocr_detection task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ocr_detection加上当图片中没有文字时报错的error msg Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10001490 --- modelscope/pipelines/cv/ocr_detection_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 62248714..b73f65a4 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -149,6 +149,8 @@ class OCRDetectionPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: rboxes = inputs['combined_rboxes'][0] count = inputs['combined_counts'][0] + if count == 0 or count < rboxes.shape[0]: + raise Exception('modelscope error: No text detected') rboxes = rboxes[:count, :] # convert rboxes to polygons and find its coordinates on the original image From 4f72134adf6f6154e5eb02602b33f2066426dbe4 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Sat, 3 Sep 2022 11:50:01 +0800 Subject: [PATCH 12/28] [to #42322933]update test video for movie scene segmentation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10007852 * update test video for movie scene segmentation --- data/test/videos/movie_scene_segmentation_test_video.mp4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test/videos/movie_scene_segmentation_test_video.mp4 b/data/test/videos/movie_scene_segmentation_test_video.mp4 index ee6ed528..21ea3cb1 100644 --- a/data/test/videos/movie_scene_segmentation_test_video.mp4 +++ b/data/test/videos/movie_scene_segmentation_test_video.mp4 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59fa397b01dc4c9b67a19ca42f149287b9c4e7b2158aba5d07d2db88af87b23f -size 126815483 +oid sha256:03002807dc2aa180c3ae104e764c7a4d6c421d186a5d552f97d338467ae6c443 +size 12722029 From ba74cdf97e8944e724b78cdfaf43f2de0fed721b Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 3 Sep 2022 12:10:16 +0800 Subject: [PATCH 13/28] [to #43878347] Rename runtime.txt to framework.txt Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10000642 * rename runtime.txt 
to framework.txt
---
 .readthedocs.yaml | 2 +-
 docker/Dockerfile.ubuntu | 2 +-
 requirements.txt | 2 +-
 requirements/{runtime.txt => framework.txt} | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename requirements/{runtime.txt => framework.txt} (100%)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index b88d734a..f7b9c7ea 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -25,4 +25,4 @@ python:
   install:
   - requirements: requirements/docs.txt
   - requirements: requirements/readthedocs.txt
-  - requirements: requirements/runtime.txt
+  - requirements: requirements/framework.txt
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 97881007..78da0b6f 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -64,7 +64,7 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
 # install modelscope
 COPY requirements /var/modelscope
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r /var/modelscope/runtime.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
diff --git a/requirements.txt b/requirements.txt
index c6e294ba..0832e6ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
--r requirements/runtime.txt
+-r requirements/framework.txt
diff --git a/requirements/runtime.txt b/requirements/framework.txt
similarity index 100%
rename from requirements/runtime.txt
rename to requirements/framework.txt

From 39a309b6554070e68741a36593211ab47910a293 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Sat, 3 Sep 2022 12:18:29 +0800
Subject: [PATCH 14/28] [to #42322933] reduce train epoch from 3 to 2

---
 tests/trainers/test_finetune_mplug.py | 2 +-
 tests/trainers/test_finetune_token_classificatin.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py
index 351600c6..b46dbf45 100644
--- a/tests/trainers/test_finetune_mplug.py
+++ b/tests/trainers/test_finetune_mplug.py
@@ -35,7 +35,7 @@ class TestFinetuneMPlug(unittest.TestCase):
             }).rename_column('image:FILE',
                              'image').rename_column('answer:Value',
                                                     'answer'))
-        self.max_epochs = 3
+        self.max_epochs = 2

     def tearDown(self):
         shutil.rmtree(self.tmp_dir)
diff --git a/tests/trainers/test_finetune_token_classificatin.py b/tests/trainers/test_finetune_token_classificatin.py
index c34410be..9bdab9b7 100644
--- a/tests/trainers/test_finetune_token_classificatin.py
+++ b/tests/trainers/test_finetune_token_classificatin.py
@@ -92,7 +92,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
             }
         }
         cfg['preprocessor'] = {'type': 'token-cls-tokenizer'}
-        cfg.train.max_epochs = 3
+        cfg.train.max_epochs = 2
         cfg.train.lr_scheduler = {
             'type': 'LinearLR',
             'start_factor': 1.0,

From 04516276265f27996b2ffb293f3ef6315055d0d7 Mon Sep 17 00:00:00 2001
From: "xingguang.zxg"
Date: Sat, 3 Sep 2022 13:21:31 +0800
Subject: =?UTF-8?q?[to=20#42322933]=E5=95=86=E5=93=81?=
 =?UTF-8?q?=E6=98=BE=E8=91=97=E6=80=A7=E5=88=86=E5=89=B2v1.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 商品显著性检测模型,依赖opencv,mmcv-full Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9909897 --- data/test/images/shop_segmentation.jpg | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/__init__.py | 2 +- .../models/cv/shop_segmentation/__init__.py | 1 + .../models/cv/shop_segmentation/common.py | 59 ++ .../models/cv/shop_segmentation/head_fpn.py | 122 +++ .../models/cv/shop_segmentation/models.py | 901 ++++++++++++++++++ .../models/cv/shop_segmentation/neck_fpn.py | 217 +++++ .../cv/shop_segmentation/shop_seg_base.py | 157 +++ .../cv/shop_segmentation/shop_seg_model.py | 115 +++ .../models/cv/shop_segmentation/utils.py | 199 ++++ modelscope/outputs.py | 8 +- modelscope/pipelines/builder.py | 4 +- modelscope/pipelines/cv/__init__.py | 3 +- .../cv/shop_segmentation_pipleline.py | 51 + modelscope/utils/constant.py | 1 + tests/pipelines/test_shop_segmentation.py | 24 + 17 files changed, 1865 insertions(+), 4 deletions(-) create mode 100644 data/test/images/shop_segmentation.jpg create mode 100644 modelscope/models/cv/shop_segmentation/__init__.py create mode 100644 modelscope/models/cv/shop_segmentation/common.py create mode 100644 modelscope/models/cv/shop_segmentation/head_fpn.py create mode 100644 modelscope/models/cv/shop_segmentation/models.py create mode 100644 modelscope/models/cv/shop_segmentation/neck_fpn.py create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_base.py create mode 100644 modelscope/models/cv/shop_segmentation/shop_seg_model.py create mode 100644 modelscope/models/cv/shop_segmentation/utils.py create mode 100644 modelscope/pipelines/cv/shop_segmentation_pipleline.py create mode 100644 tests/pipelines/test_shop_segmentation.py diff --git a/data/test/images/shop_segmentation.jpg b/data/test/images/shop_segmentation.jpg new file mode 100644 index 00000000..ec02881d --- /dev/null +++ b/data/test/images/shop_segmentation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ecc371c8b0ca09d0e11df89bc549000937eafc451929586426fe657ade25a0 +size 238607 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 06b5a476..b1bf9600 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + shop_segmentation = 'shop-segmentation' # EasyCV models yolox = 'YOLOX' @@ -148,6 +149,7 @@ class Pipelines(object): image_reid_person = 'passvitb-image-reid-person' text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' + shop_segmentation = 'shop-segmentation' # nlp tasks sentence_similarity = 'sentence-similarity' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 4db43d17..f2798b59 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -11,7 +11,7 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, image_to_image_generation, image_to_image_translation, movie_scene_segmentation, object_detection, product_retrieval_embedding, realtime_object_detection, - salient_detection, super_resolution, + salient_detection, shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py new file mode 100644 index 00000000..b40a0760 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/__init__.py @@ -0,0 +1 @@ +from .shop_seg_base import SHOPSEG diff --git a/modelscope/models/cv/shop_segmentation/common.py b/modelscope/models/cv/shop_segmentation/common.py new file mode 100644 index 00000000..00ba9996 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/common.py @@ -0,0 +1,59 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import warnings + +import torch.nn as nn +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > input_w: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +class Upsample(nn.Module): + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + size = [int(t * self.scale_factor) for t in x.shape[-2:]] + else: + size = self.size + return resize(x, size, None, self.mode, self.align_corners) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py new file mode 100644 index 00000000..b3faa9b8 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -0,0 +1,122 @@ +""" FPNHead +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. 
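+This module provides the Semantic-FPN style decode head (FPNHead) used by the shop segmentation model.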
+""" + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from timm.models.layers import drop, drop_path, trunc_normal_ + +from .common import Upsample, resize + + +class FPNHead(nn.Module): + """Panoptic Feature Pyramid Networks. + This head is the implementation of `Semantic FPN + `_. + Args: + feature_strides (tuple[int]): The strides for input feature maps. + stack_lateral. All strides suppose to be power of 2. The first + one is of largest resolution. + """ + + def __init__(self, + channels, + num_classes, + dropout_ratio=0.1, + feature_strides=[4, 8, 16, 32], + align_corners=False, + **kwargs): + super(FPNHead, self).__init__() + self.act_cfg = dict(type='ReLU') + self.channels = channels + self.conv_cfg = None + self.norm_cfg = None + self.norm_cfg = dict(type='BN2d', requires_grad=True) + self.align_corners = align_corners + self.dropout_ratio = dropout_ratio + self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.in_index = [0, 1, 2, 3] + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + self.scale_heads = nn.ModuleList() + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + self.apply(self._init_weights) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ + Returns: + Tensor: The transformed inputs + """ + inputs = [inputs[i] for i in self.in_index] + return inputs + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def forward(self, inputs): + x = self._transform_inputs(inputs) + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + # non inplace + output = output + resize( + self.scale_heads[i](x[i]), + size=output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + output = self.cls_seg(output) + return output + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias.data, 0) diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py new file mode 100644 index 00000000..8b82d1d1 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -0,0 +1,901 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import math +from collections import OrderedDict + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import drop, drop_path, trunc_normal_ +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + self.embed_dim = embed_dim + self.spacial_dim = spacial_dim + + def forward(self, x): + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]).permute(2, 0, + 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + + cls_pos = self.positional_embedding[0:1, :] + spatial_pos = F.interpolate( + self.positional_embedding[1:, ].reshape(1, self.spacial_dim, + self.spacial_dim, + self.embed_dim).permute( + 0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape(self.embed_dim, H * W).permute(1, 0) + positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) + + x = x + positional_embedding[:, None, :] + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + + x = x.permute(1, 2, 0) + global_feat = x[:, :, 0] + feature_map = x[:, :, 1:].reshape(B, -1, H, W) + return global_feat, feature_map + + +class CLIPResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim=512, + input_resolution=224, + width=64, + pretrained=None, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in CLIPResNet') + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + + outs = [] + x = self.layer1(x) + outs.append(x) + x = self.layer2(x) + outs.append(x) + x = self.layer3(x) + outs.append(x) + x = self.layer4(x) + outs.append(x) + + return tuple(outs) + + +class CLIPResNetWithAttention(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim=1024, + input_resolution=224, + width=64, + pretrained=None, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32, + output_dim) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + if 'positional_embedding' in new_k: + if self.attnpool.positional_embedding.shape != state_dict[ + new_k].shape: + print( + f'Resize the pos_embed shape from {state_dict[new_k].shape}' + f' to {self.attnpool.positional_embedding.shape}' + ) + cls_pos = state_dict[new_k][0:1, :] + H = W = self.input_resolution // 32 + old_h = int( + math.sqrt(state_dict[new_k][1:, ].shape[0])) + spatial_pos = F.interpolate( + state_dict[new_k][1:, ].reshape( + 1, old_h, old_h, + cls_pos.shape[1]).permute(0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape( + cls_pos.shape[1], H * W).permute(1, 0) + positional_embedding = torch.cat( + [cls_pos, spatial_pos], dim=0) + state_dict[new_k] = positional_embedding + assert self.attnpool.positional_embedding.shape == state_dict[ + new_k].shape + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in CLIPResNet') + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + + outs = [] + x = self.layer1(x) + outs.append(x) + x = self.layer2(x) + outs.append(x) + x = self.layer3(x) + outs.append(x) + x = self.layer4(x) + outs.append(x) + + x_global, x_local = self.attnpool(x) + outs.append([x_global, x_local]) + + return tuple(outs) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle 
fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None, + drop_path=0.): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.drop_path(self.attention(self.ln_1(x))) + x = x + self.drop_path(self.mlp(self.ln_2(x))) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + drop_path_rate=0.): + super().__init__() + self.width = width + self.layers = layers + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers) + ] # stochastic depth decay rule + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) + for i in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim**-0.5 + + self.q_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.k_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.v_proj = nn.Linear(dim, dim, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, q, k, v): + B, N, C = q.shape + assert k.shape == v.shape + B, M, C = k.shape + q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads) + k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads) + v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads) + + attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale + + attn = attn.softmax(dim=-1) + + x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class TransformerDecoderLayer(nn.Module): + + def __init__( + self, + d_model, + nhead, + dropout=0.1, + ): + super().__init__() + self.self_attn = Attention(d_model, nhead, proj_drop=dropout) + self.cross_attn = Attention(d_model, nhead, proj_drop=dropout) 
+ + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.mlp = nn.Sequential( + nn.Linear(d_model, d_model * 4), nn.GELU(), nn.Dropout(dropout), + nn.Linear(d_model * 4, d_model)) + + def forward(self, x, mem): + q = k = v = self.norm1(x) + x = x + self.self_attn(q, k, v) + q = self.norm2(x) + x = x + self.cross_attn(q, mem, mem) + x = x + self.dropout(self.mlp(self.norm3(x))) + return x + + +class CLIPVisionTransformer(nn.Module): + + def __init__(self, + input_resolution=224, + patch_size=32, + width=768, + layers=12, + heads=12, + output_dim=512, + drop_path_rate=0.0, + out_indices=[3, 5, 7, 11], + pretrained=None, + get_embeddings=False, + **kwargs): + super().__init__() + self.pretrained = pretrained + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.spatial_size = input_resolution // patch_size + self.ln_pre = LayerNorm(width) + self.get_embeddings = get_embeddings + + self.transformer = Transformer( + width, layers, heads, drop_path_rate=drop_path_rate) + + self.out_indices = out_indices + + if get_embeddings: + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + embed_dim = width + + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn3 = nn.GroupNorm(1, embed_dim) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=2, stride=2)) + + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.GroupNorm(1, embed_dim) + + self.fpn3 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + self.fpn4 = nn.Sequential( + nn.GroupNorm(1, embed_dim), + nn.MaxPool2d(kernel_size=4, stride=4), + ) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('visual.'): + new_k = k.replace('visual.', '') + state_dict[new_k] = checkpoint[k] + + if 'positional_embedding' in state_dict.keys(): + if self.positional_embedding.shape != state_dict[ + 'positional_embedding'].shape: + print( + f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to' + f' {self.positional_embedding.shape}') + cls_pos = state_dict['positional_embedding'][0:1, :] + spatial_pos = F.interpolate( + state_dict['positional_embedding'][1:, ].reshape( + 1, 14, 14, 768).permute(0, 3, 1, 2), + size=(self.spatial_size, self.spatial_size), + mode='bilinear') + spatial_pos = spatial_pos.reshape( + 768, + self.spatial_size * 
self.spatial_size).permute(1, 0) + positional_embedding = torch.cat([cls_pos, spatial_pos], + dim=0) + state_dict['positional_embedding'] = positional_embedding + assert self.positional_embedding.shape == state_dict[ + 'positional_embedding'].shape + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in vision transformer') + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + B, C, H, W = x.shape + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x1 = self.class_embedding.to(x.dtype) + x2 = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([x1 + x2, x], dim=1) + pos = self.positional_embedding.to(x.dtype) + cls_pos = pos[0, :] + self.class_embedding.to(x.dtype) + spatial_pos = F.interpolate( + pos[1:, ].reshape(1, self.spatial_size, self.spatial_size, + C).permute(0, 3, 1, 2), + size=(H, W), + mode='bilinear') + spatial_pos = spatial_pos.reshape(1, C, H * W).permute(0, 2, 1) + pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) + x = x + pos + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + + gradientcheckpoint = False + + features = [] + for i, blk in enumerate(self.transformer.resblocks): + if gradientcheckpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + if i in self.out_indices: + xp = x.permute(1, 0, 2)[:, + 1:, :].permute(0, 2, + 1).reshape(B, -1, H, W) + features.append(xp.contiguous()) + + ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(features)): + features[i] = ops[i](features[i]) + + if self.get_embeddings: + x = x.permute(1, 0, 2) + x = self.ln_post(x) + x = x @ self.proj + + global_embedding = x[:, 0] + visual_embedding = x[:, 1:].reshape(B, H, W, + -1).permute(0, 3, 1, + 2) # B C H W + + features.append([global_embedding, visual_embedding]) + + return tuple(features) + + +class CLIPTextEncoder(nn.Module): + + def __init__(self, + context_length=77, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=1024, + out_dim=256, + pretrained=None, + **kwargs): + super().__init__() + + self.pretrained = pretrained + + self.context_length = context_length + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('transformer.'): + state_dict[k] = checkpoint[k] + + if k == 'positional_embedding' or k == 'text_projection' or k.startswith( + 'token_embedding') or k.startswith('ln_final'): + if k == 'positional_embedding' and checkpoint[k].size( + 0) > self.context_length: + checkpoint[k] = checkpoint[k][:self.context_length] + print('positional_embedding is tuncated from 77 to', + self.context_length) + state_dict[k] = checkpoint[k] + + u, w = 
self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in text encoder') + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def forward(self, text): + x = self.token_embedding(text) + x = x + self.positional_embedding + x = x.permute(1, 0, 2) + x = self.transformer(x) + x = x.permute(1, 0, 2) + x = self.ln_final(x) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1), ...] @ self.text_projection + return x + + +class CLIPTextContextEncoder(nn.Module): + + def __init__(self, + context_length=22, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=1024, + out_dim=256, + pretrained=None, + **kwargs): + super().__init__() + + self.pretrained = pretrained + + self.context_length = context_length + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.embed_dim = embed_dim + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + + def init_weights(self, pretrained=None): + pretrained = pretrained or self.pretrained + if isinstance(pretrained, str): + checkpoint = torch.jit.load( + pretrained, map_location='cpu').float().state_dict() + + state_dict = {} + + for k in checkpoint.keys(): + if k.startswith('transformer.'): + state_dict[k] = checkpoint[k] + + if k == 'positional_embedding' or k == 'text_projection' or k.startswith( + 'token_embedding') or k.startswith('ln_final'): + if k == 'positional_embedding' and checkpoint[k].size( + 0) > self.context_length: + checkpoint[k] = checkpoint[k][:self.context_length] + print('positional_embedding is tuncated from 77 to', + self.context_length) + state_dict[k] = checkpoint[k] + + u, w = self.load_state_dict(state_dict, False) + print(u, w, 'are misaligned params in text encoder') + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def forward(self, text, context=None): + x_text = self.token_embedding(text) # n_clas, n_text, C + K, N1, C = x_text.shape # 150类 * 5??? 
* 512 + B, N2, C = context.shape # 1 * 8 * 512 + + eos_indx = text.argmax(dim=-1) + N2 + eos_indx = eos_indx.reshape(1, K).expand(B, K).reshape(-1) + + x_text = x_text.reshape(1, K, N1, C).expand(B, K, N1, C) + context = context.reshape(B, 1, N2, C).expand(B, K, N2, C) + + x = torch.cat([x_text[:, :, 0:1], context, x_text[:, :, 1:]], + dim=2).reshape(B * K, N1 + N2, C) + x = x + self.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x) + x = x[torch.arange(x.shape[0]), eos_indx] @ self.text_projection + x = x.reshape(B, K, self.embed_dim) + return x + + +class ContextDecoder(nn.Module): + + def __init__(self, + transformer_width=256, + transformer_heads=4, + transformer_layers=6, + visual_dim=1024, + dropout=0.1, + **kwargs): + super().__init__() + + self.memory_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), + nn.LayerNorm(transformer_width), + ) + + self.text_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), + ) + + self.decoder = nn.ModuleList([ + TransformerDecoderLayer(transformer_width, transformer_heads, + dropout) for _ in range(transformer_layers) + ]) + + self.out_proj = nn.Sequential( + nn.LayerNorm(transformer_width), + nn.Linear(transformer_width, visual_dim)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, text, visual): + B, N, C = visual.shape + visual = self.memory_proj(visual) + x = self.text_proj(text) + + for layer in self.decoder: + x = layer(x, visual) + + return self.out_proj(x) diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py new file mode 100644 index 00000000..108cb043 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -0,0 +1,217 @@ +""" FPNneck +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from timm.models.layers import drop, drop_path, trunc_normal_ + +from .common import resize + + +class FPN(nn.Module): + """Feature Pyramid Network. + + This neck is the implementation of `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. 
+ If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: dict(mode='nearest'). + init_cfg (dict or list[dict], optional): Initialization config dict. + + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # For compatibility with previous release + # TODO: deprecate `extra_convs_on_inputs` + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels 
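+                # each extra level takes the previous output (out_channels) as
+                # input, except the first one when add_extra_convs == 'on_input',
+                # which reads the raw backbone feature selected above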
+ extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + self.apply(self._init_weights) + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias.data, 0) diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_base.py b/modelscope/models/cv/shop_segmentation/shop_seg_base.py new file mode 100644 index 00000000..e3ae0d54 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/shop_seg_base.py @@ -0,0 +1,157 @@ +""" +Base modules are adapted from https://github.com/open-mmlab/mmcv/, +originally Apache 2.0 License, Copyright (c) 2018-2022 OpenMMLab, +https://github.com/open-mmlab/mmsegmentation/, +originally Apache 2.0 License, Copyright (c) 2020-2021 OpenMMLab, +and adapted from https://github.com/raoyongming/DenseCLIP/, +originally MIT License, Copyright (c) 2022 Rao, Yongming. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .head_fpn import FPNHead +from .models import (CLIPTextContextEncoder, CLIPVisionTransformer, + ContextDecoder) +from .neck_fpn import FPN +from .utils import SimpleTokenizer, tokenize + + +class SHOPSEG(nn.Module): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. 
+ Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. + """ + + def __init__(self, + model_dir, + context_length=22, + context_feature='attention', + score_concat_index=2, + tau=0.07, + token_embed_dim=512, + text_dim=512, + **args): + super(SHOPSEG, self).__init__() + + self.model_dir = model_dir + self.tokenizer = SimpleTokenizer(model_dir + + '/bpe_simple_vocab_16e6.txt.gz') + + backbone = CLIPVisionTransformer( + input_resolution=1024, + patch_size=16, + width=768, + layers=12, + output_dim=512, + drop_path_rate=0.1, + pretrained=False, + get_embeddings=True) + + text_encoder = CLIPTextContextEncoder( + context_length=30, + vocab_size=49408, + transformer_width=512, + transformer_heads=8, + transformer_layers=12, + embed_dim=512, + pretrained=False) + + context_decoder = ContextDecoder( + transformer_width=256, + transformer_heads=4, + transformer_layers=3, + visual_dim=512, + dropout=0.1) + neck = FPN( + in_channels=[768, 768, 768 + 2, 768], out_channels=256, num_outs=4) + head_fpd = FPNHead(channels=256, num_classes=2) + + self.backbone = backbone + self.text_encoder = text_encoder + self.context_decoder = context_decoder + self.context_length = context_length + self.score_concat_index = score_concat_index + + self.context_feature = context_feature + self.tau = tau + context_length = self.text_encoder.context_length - self.context_length + self.contexts = nn.Parameter( + torch.randn(1, context_length, token_embed_dim)) + nn.init.trunc_normal_(self.contexts) + self.gamma = nn.Parameter(torch.ones(text_dim) * 1e-4) + + self.neck = neck + self.head_fpn = head_fpd + + self.tau = 0.07 + + def encode_text(self, text, context_length): + output = tokenize(self.tokenizer, text, context_length, True) + return output + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + return x + + def after_extract_feat(self, x, name_list): + x_orig = list(x[0:4]) + global_feat, visual_embeddings = x[4] + B, C, H, W = visual_embeddings.shape + if self.context_feature == 'attention': + x1 = global_feat.reshape(B, C, 1) + x2 = visual_embeddings.reshape(B, C, H * W) + visual_context = torch.cat([x1, x2], dim=2).permute(0, 2, 1) + texts = torch.cat([ + self.encode_text(c, context_length=self.context_length) + for c in name_list + ]) + x1 = texts.to(global_feat.device) + x1 = self.text_encoder(x1, self.contexts) + text_embeddings = x1.expand(B, -1, -1) + # update text_embeddings by visual_context! 
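+            # the context decoder cross-attends the text embeddings to the visual
+            # tokens; the resulting residual is scaled by the learnable gamma below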
+ # (B, 1, C) + text_diff = self.context_decoder(text_embeddings, visual_context) + # (B, K, C) + text_embeddings = text_embeddings + self.gamma * text_diff + + # compute score map and concat + B, K, C = text_embeddings.shape + visual_embeddings = F.normalize(visual_embeddings, dim=1, p=2) + text = F.normalize(text_embeddings, dim=2, p=2) + score_map_list = [] + bsz = B + for i in range(bsz): + ind = 2 * i + sub_text = torch.cat( + [text[i:i + 1, ind:ind + 1], text[i:i + 1, ind + 1:ind + 2]], + dim=1) # 1 * 2 * h * w + + sub_score_map = torch.einsum('bchw,bkc->bkhw', + visual_embeddings[i:i + 1], + sub_text) # 1 * 2 * h * w + score_map_list.append(sub_score_map) + score_map = torch.cat(score_map_list, dim=0) # b * 2 * h * w + x_orig[self.score_concat_index] = torch.cat( + [x_orig[self.score_concat_index], score_map], dim=1) + return x_orig, score_map + + def forward(self, img, text_list=None): + if text_list is None: + bsz = img.size()[0] + text_list = ['foregeound'] * bsz + x = self.extract_feat(img) + _x_orig = [x[i] for i in range(4)] + name_list = [] + for name in text_list: + name_list.append('others') + name_list.append(name[0:20]) + x_orig, score_map = self.after_extract_feat(x, name_list) + x_orig = list(self.neck(x_orig)) + _x_orig = x_orig + pred = self.head_fpn(_x_orig) + return pred diff --git a/modelscope/models/cv/shop_segmentation/shop_seg_model.py b/modelscope/models/cv/shop_segmentation/shop_seg_model.py new file mode 100644 index 00000000..409c583b --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/shop_seg_model.py @@ -0,0 +1,115 @@ +import os.path as osp +from typing import Any, Dict + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.shop_segmentation import SHOPSEG +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ShopSegmentation'] + + +@MODELS.register_module( + Tasks.shop_segmentation, module_name=Models.shop_segmentation) +class ShopSegmentation(TorchModel): + """ shop segmentation model. + """ + + def __init__(self, model_dir, device_id=0, *args, **kwargs): + super().__init__( + model_dir=model_dir, device_id=device_id, *args, **kwargs) + + self.model = SHOPSEG(model_dir=model_dir) + pretrained_params = torch.load('{}/{}'.format( + model_dir, ModelFile.TORCH_MODEL_BIN_FILE)) + + self.model.load_state_dict(pretrained_params) + self.model.eval() + self.device_id = device_id + if self.device_id >= 0 and torch.cuda.is_available(): + self.model.to('cuda:{}'.format(self.device_id)) + logger.info('Use GPU: {}'.format(self.device_id)) + else: + self.device_id = -1 + logger.info('Use CPU for inference') + + def preprocess(self, img, size=1024): + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + h, w, c = img.shape + max_hw = max(h, w) + ratio = 1.0 * size / max_hw + crop_h, crop_w = int(ratio * h), int(ratio * w) + pil_img = Image.fromarray(img) + pil_img = pil_img.resize((crop_w, crop_h), Image.BILINEAR) + np_img = np.array(pil_img, dtype=np.float32) / 255. 
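+        # pixel values are now in [0, 1]; the loop below standardizes each
+        # channel with the CLIP image mean/std defined above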
+ + for j in range(3): + np_img[:, :, j] = (np_img[:, :, j] - mean[j]) / std[j] + + img_pad = np.zeros((size, size, 3), dtype=np.float32) + img_pad[:crop_h, :crop_w] = np_img + + img_pad = torch.from_numpy(img_pad).permute(2, 0, + 1).unsqueeze(0).float() + return img_pad, h, w, crop_h, crop_w + + def postprocess(self, tensors, crop_h, crop_w, ori_h, ori_w): + output = np.clip(tensors * 255., a_min=0, a_max=255.) + crop_output = np.array(output[:crop_h, :crop_w], dtype=np.uint8) + + pil_output = Image.fromarray(crop_output) + pil_output = pil_output.resize((ori_w, ori_h), Image.BILINEAR) + np_output = np.array(pil_output, dtype=np.uint8) + + np_output[np_output < 128] = 0 + np_output[np_output >= 128] = 255 + np_output = np.uint8(np_output) + return np_output + + def forward(self, image): + """ + image should be numpy array, dtype=np.uint8, shape: height*width*3 + """ + image_tensor, ori_h, ori_w, crop_h, crop_w = self.preprocess( + image, size=1024) + pred = self.inference(image_tensor) + msk = self.postprocess(pred, crop_h, crop_w, ori_h, ori_w, size=1024) + + outputs = {OutputKeys.MASKS: msk} + return outputs + + def inference(self, image): + """ + image should be tensor, 1 * 3 * 1024 * 1024 + """ + with torch.no_grad(): + if self.device_id == -1: + output = self.model(image) + else: + device = torch.device('cuda', self.device_id) + output = self.model(image.to(device)) + output = F.interpolate(output, size=(1024, 1024), mode='bilinear') + output = F.softmax(output, dim=1) + output = torch.argmax(output, dim=1) + output = output[0] + if self.device_id == -1: + pred = output.data.numpy() + else: + pred = output.data.cpu().numpy() + + del output + return pred diff --git a/modelscope/models/cv/shop_segmentation/utils.py b/modelscope/models/cv/shop_segmentation/utils.py new file mode 100644 index 00000000..c41f8a65 --- /dev/null +++ b/modelscope/models/cv/shop_segmentation/utils.py @@ -0,0 +1,199 @@ +""" CLIP Tokenizer +Adapted from https://github.com/openai/CLIP. +Originally MIT License, Copyright (c) 2021 OpenAI. +""" + +import gzip +import html +import os +from functools import lru_cache +from typing import Any, List, Union + +import ftfy +import regex as re +import torch + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + error_list = [] + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception as err: + error_list.append(err) + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text + + +def tokenize(tokenizer, + texts, + context_length: int = 77, + truncate: bool = False) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all CLIP models use 77 as the context length + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = tokenizer.encoder['<|startoftext|>'] + eot_token = 
tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + else: + raise RuntimeError( + f'Input {texts[i]} is too long for context length {context_length}' + ) + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/modelscope/outputs.py b/modelscope/outputs.py index e84c8dcc..8fe71ec2 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -259,7 +259,13 @@ TASK_OUTPUTS = { # ] # } Tasks.text_driven_segmentation: [OutputKeys.MASKS], - + # shop segmentation result for single sample + # { + # "masks": [ + # np.array # 2D array containing only 0, 255 + # ] + # } + Tasks.shop_segmentation: [OutputKeys.MASKS], # movide scene segmentation result for a single video # { # "split_video_num":3, diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f43d152b..f6381857 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -156,7 +156,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vitl16_segmentation_text-driven-seg'), Tasks.movie_scene_segmentation: (Pipelines.movie_scene_segmentation, - 'damo/cv_resnet50-bert_video-scene-segmentation_movienet') + 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), + Tasks.shop_segmentation: (Pipelines.shop_segmentation, + 'damo/cv_vitb16_segmentation_shop-seg'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 9e7d80ee..d3dba978 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -43,10 +43,10 @@ if TYPE_CHECKING: from .tinynas_classification_pipeline import TinynasClassificationPipeline from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline + from .shop_segmentation_pipleline import ShopSegmentationPipeline from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline - else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -96,6 +96,7 @@ else: 'tinynas_classification_pipeline': ['TinynasClassificationPipeline'], 'video_category_pipeline': ['VideoCategoryPipeline'], 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], + 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], 'easycv_pipeline': [ 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', 'Face2DKeypointsPipeline' diff --git a/modelscope/pipelines/cv/shop_segmentation_pipleline.py b/modelscope/pipelines/cv/shop_segmentation_pipleline.py new file mode 100644 index 00000000..b7fd90b4 --- /dev/null +++ b/modelscope/pipelines/cv/shop_segmentation_pipleline.py @@ -0,0 +1,51 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.shop_segmentation, module_name=Pipelines.shop_segmentation) +class ShopSegmentationPipeline(Pipeline): 
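+    """Shop segmentation pipeline.
+
+    A minimal usage sketch, mirroring tests/pipelines/test_shop_segmentation.py:
+
+        >>> from modelscope.outputs import OutputKeys
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.utils.constant import Tasks
+        >>> shop_seg = pipeline(Tasks.shop_segmentation,
+        >>>                     model='damo/cv_vitb16_segmentation_shop-seg')
+        >>> result = shop_seg('data/test/images/shop_segmentation.jpg')
+        >>> mask = result[OutputKeys.MASKS]  # 2D array containing only 0 and 255
+    """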
+ + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img_tensor, ori_h, ori_w, crop_h, crop_w = self.model.preprocess(img) + result = { + 'img': img_tensor, + 'ori_h': ori_h, + 'ori_w': ori_w, + 'crop_h': crop_h, + 'crop_w': crop_w + } + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = { + 'data': outputs, + 'ori_h': input['ori_h'], + 'ori_w': input['ori_w'], + 'crop_h': input['crop_h'], + 'crop_w': input['crop_w'], + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + data = self.model.postprocess(inputs['data'], inputs['crop_h'], + inputs['crop_w'], inputs['ori_h'], + inputs['ori_w']) + outputs = {OutputKeys.MASKS: data} + return outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 86808ea1..1b738bfe 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_segmentation = 'image-segmentation' portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' + shop_segmentation = 'shop-segmentation' # image editing skin_retouching = 'skin-retouching' diff --git a/tests/pipelines/test_shop_segmentation.py b/tests/pipelines/test_shop_segmentation.py new file mode 100644 index 00000000..58c56dd7 --- /dev/null +++ b/tests/pipelines/test_shop_segmentation.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class ShopSegmentationTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_shop_segmentation(self): + input_location = 'data/test/images/shop_segmentation.jpg' + model_id = 'damo/cv_vitb16_segmentation_shop-seg' + shop_seg = pipeline(Tasks.shop_segmentation, model=model_id) + result = shop_seg(input_location) + import cv2 + # result[OutputKeys.MASKS] is segment map result,other keys are not used + cv2.imwrite(input_location + '_shopseg.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() From f508be89183cc2d9047bbb6fcbe23685a239959d Mon Sep 17 00:00:00 2001 From: ly261666 Date: Sat, 3 Sep 2022 23:48:42 +0800 Subject: [PATCH 16/28] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EReti?= =?UTF-8?q?naFace=E4=BA=BA=E8=84=B8=E6=A3=80=E6=B5=8B=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 新增人脸检测RetinaFace模型; 2. 
完成Maas-cv CR标准自查 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9945188 --- data/test/images/retina_face_detection.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/face_detection/retinaface/__init__.py | 0 .../cv/face_detection/retinaface/detection.py | 137 ++++++++++++++++ .../retinaface/models/__init__.py | 0 .../face_detection/retinaface/models/net.py | 149 ++++++++++++++++++ .../retinaface/models/retinaface.py | 145 +++++++++++++++++ .../cv/face_detection/retinaface/utils.py | 123 +++++++++++++++ modelscope/pipelines/base.py | 1 - .../cv/retina_face_detection_pipeline.py | 55 +++++++ tests/pipelines/test_retina_face_detection.py | 33 ++++ 11 files changed, 647 insertions(+), 1 deletion(-) create mode 100644 data/test/images/retina_face_detection.jpg create mode 100644 modelscope/models/cv/face_detection/retinaface/__init__.py create mode 100755 modelscope/models/cv/face_detection/retinaface/detection.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/__init__.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/net.py create mode 100755 modelscope/models/cv/face_detection/retinaface/models/retinaface.py create mode 100755 modelscope/models/cv/face_detection/retinaface/utils.py create mode 100644 modelscope/pipelines/cv/retina_face_detection_pipeline.py create mode 100644 tests/pipelines/test_retina_face_detection.py diff --git a/data/test/images/retina_face_detection.jpg b/data/test/images/retina_face_detection.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/retina_face_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index b1bf9600..9638268c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' # EasyCV models @@ -118,6 +119,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + retina_face_detection = 'resnet50-face-detection-retinaface' live_category = 'live-category' general_image_classification = 'vit-base_image-classification_ImageNet-labels' daily_image_classification = 'vit-base_image-classification_Dailylife-labels' diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/retinaface/detection.py b/modelscope/models/cv/face_detection/retinaface/detection.py new file mode 100755 index 00000000..3dd31659 --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/detection.py @@ -0,0 +1,137 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .models.retinaface import 
RetinaFace +from .utils import PriorBox, decode, decode_landm, py_cpu_nms + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.retinaface) +class RetinaFaceDetection(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.cfg = Config.from_file( + model_path.replace(ModelFile.TORCH_MODEL_FILE, + ModelFile.CONFIGURATION))['models'] + self.net = RetinaFace(cfg=self.cfg) + self.load_model() + self.device = device + self.net = self.net.to(self.device) + + self.mean = torch.tensor([[[[104]], [[117]], [[123]]]]).to(device) + + def check_keys(self, pretrained_state_dict): + ckpt_keys = set(pretrained_state_dict.keys()) + model_keys = set(self.net.state_dict().keys()) + used_pretrained_keys = model_keys & ckpt_keys + assert len( + used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' + return True + + def remove_prefix(self, state_dict, prefix): + new_state_dict = dict() + for k, v in state_dict.items(): + if k.startswith(prefix): + new_state_dict[k[len(prefix):]] = v + else: + new_state_dict[k] = v + return new_state_dict + + def load_model(self, load_to_cpu=False): + pretrained_dict = torch.load( + self.model_path, map_location=torch.device('cpu')) + if 'state_dict' in pretrained_dict.keys(): + pretrained_dict = self.remove_prefix(pretrained_dict['state_dict'], + 'module.') + else: + pretrained_dict = self.remove_prefix(pretrained_dict, 'module.') + self.check_keys(pretrained_dict) + self.net.load_state_dict(pretrained_dict, strict=False) + self.net.eval() + + def forward(self, input): + img_raw = input['img'].cpu().numpy() + img = np.float32(img_raw) + + im_height, im_width = img.shape[:2] + ss = 1.0 + # tricky + if max(im_height, im_width) > 1500: + ss = 1000.0 / max(im_height, im_width) + img = cv2.resize(img, (0, 0), fx=ss, fy=ss) + im_height, im_width = img.shape[:2] + + scale = torch.Tensor( + [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) + img -= (104, 117, 123) + img = img.transpose(2, 0, 1) + img = torch.from_numpy(img).unsqueeze(0) + img = img.to(self.device) + scale = scale.to(self.device) + + loc, conf, landms = self.net(img) # forward pass + del img + + confidence_threshold = 0.9 + nms_threshold = 0.4 + top_k = 5000 + keep_top_k = 750 + + priorbox = PriorBox(self.cfg, image_size=(im_height, im_width)) + priors = priorbox.forward() + priors = priors.to(self.device) + prior_data = priors.data + boxes = decode(loc.data.squeeze(0), prior_data, self.cfg['variance']) + boxes = boxes * scale + boxes = boxes.cpu().numpy() + scores = conf.squeeze(0).data.cpu().numpy()[:, 1] + landms = decode_landm( + landms.data.squeeze(0), prior_data, self.cfg['variance']) + scale1 = torch.Tensor([ + im_width, im_height, im_width, im_height, im_width, im_height, + im_width, im_height, im_width, im_height + ]) + scale1 = scale1.to(self.device) + landms = landms * scale1 + landms = landms.cpu().numpy() + + # ignore low scores + inds = np.where(scores > confidence_threshold)[0] + boxes = boxes[inds] + landms = landms[inds] + scores = scores[inds] + + # keep top-K before NMS + order = scores.argsort()[::-1][:top_k] + boxes = boxes[order] + landms = landms[order] + scores = scores[order] + + # do NMS + dets = np.hstack((boxes, scores[:, np.newaxis])).astype( + np.float32, copy=False) + keep = py_cpu_nms(dets, nms_threshold) + dets = dets[keep, :] + landms = landms[keep] + + # keep top-K faster NMS + dets = dets[:keep_top_k, :] + landms = 
landms[:keep_top_k, :] + + landms = landms.reshape((-1, 5, 2)) + landms = landms.reshape( + -1, + 10, + ) + return dets / ss, landms / ss diff --git a/modelscope/models/cv/face_detection/retinaface/models/__init__.py b/modelscope/models/cv/face_detection/retinaface/models/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/retinaface/models/net.py b/modelscope/models/cv/face_detection/retinaface/models/net.py new file mode 100755 index 00000000..3be7c4b9 --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/models/net.py @@ -0,0 +1,149 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +import time + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models +import torchvision.models._utils as _utils +from torch.autograd import Variable + + +def conv_bn(inp, oup, stride=1, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_bn_no_relu(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ) + + +def conv_bn1X1(inp, oup, stride, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), + nn.BatchNorm2d(oup), nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_dw(inp, oup, stride, leaky=0.1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + ) + + +class SSH(nn.Module): + + def __init__(self, in_channel, out_channel): + super(SSH, self).__init__() + assert out_channel % 4 == 0 + leaky = 0 + if (out_channel <= 64): + leaky = 0.1 + self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1) + + self.conv5X5_1 = conv_bn( + in_channel, out_channel // 4, stride=1, leaky=leaky) + self.conv5X5_2 = conv_bn_no_relu( + out_channel // 4, out_channel // 4, stride=1) + + self.conv7X7_2 = conv_bn( + out_channel // 4, out_channel // 4, stride=1, leaky=leaky) + self.conv7x7_3 = conv_bn_no_relu( + out_channel // 4, out_channel // 4, stride=1) + + def forward(self, input): + conv3X3 = self.conv3X3(input) + + conv5X5_1 = self.conv5X5_1(input) + conv5X5 = self.conv5X5_2(conv5X5_1) + + conv7X7_2 = self.conv7X7_2(conv5X5_1) + conv7X7 = self.conv7x7_3(conv7X7_2) + + out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) + out = F.relu(out) + return out + + +class FPN(nn.Module): + + def __init__(self, in_channels_list, out_channels): + super(FPN, self).__init__() + leaky = 0 + if (out_channels <= 64): + leaky = 0.1 + self.output1 = conv_bn1X1( + in_channels_list[0], out_channels, stride=1, leaky=leaky) + self.output2 = conv_bn1X1( + in_channels_list[1], out_channels, stride=1, leaky=leaky) + self.output3 = conv_bn1X1( + in_channels_list[2], out_channels, stride=1, leaky=leaky) + + self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) + self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) + + def forward(self, input): + # names = list(input.keys()) + input = list(input.values()) + + output1 = self.output1(input[0]) + output2 = self.output2(input[1]) + output3 = self.output3(input[2]) + + up3 = F.interpolate( + output3, size=[output2.size(2), output2.size(3)], mode='nearest') 
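        # Top-down FPN merge: the deeper, lower-resolution feature map is upsampled
        # with nearest-neighbor interpolation to the spatial size of the level above,
        # added element-wise, then smoothed by a 3x3 conv_bn "merge" block.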
+ output2 = output2 + up3 + output2 = self.merge2(output2) + + up2 = F.interpolate( + output2, size=[output1.size(2), output1.size(3)], mode='nearest') + output1 = output1 + up2 + output1 = self.merge1(output1) + + out = [output1, output2, output3] + return out + + +class MobileNetV1(nn.Module): + + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 8, 2, leaky=0.1), # 3 + conv_dw(8, 16, 1), # 7 + conv_dw(16, 32, 2), # 11 + conv_dw(32, 32, 1), # 19 + conv_dw(32, 64, 2), # 27 + conv_dw(64, 64, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(64, 128, 2), # 43 + 16 = 59 + conv_dw(128, 128, 1), # 59 + 32 = 91 + conv_dw(128, 128, 1), # 91 + 32 = 123 + conv_dw(128, 128, 1), # 123 + 32 = 155 + conv_dw(128, 128, 1), # 155 + 32 = 187 + conv_dw(128, 128, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(128, 256, 2), # 219 +3 2 = 241 + conv_dw(256, 256, 1), # 241 + 64 = 301 + ) + self.avg = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, 1000) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.avg(x) + x = x.view(-1, 256) + x = self.fc(x) + return x diff --git a/modelscope/models/cv/face_detection/retinaface/models/retinaface.py b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py new file mode 100755 index 00000000..8d2001dd --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/models/retinaface.py @@ -0,0 +1,145 @@ +# The implementation is based on resnet, available at https://github.com/biubug6/Pytorch_Retinaface +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models +import torchvision.models._utils as _utils +import torchvision.models.detection.backbone_utils as backbone_utils + +from .net import FPN, SSH, MobileNetV1 + + +class ClassHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(ClassHead, self).__init__() + self.num_anchors = num_anchors + self.conv1x1 = nn.Conv2d( + inchannels, + self.num_anchors * 2, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 2) + + +class BboxHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(BboxHead, self).__init__() + self.conv1x1 = nn.Conv2d( + inchannels, + num_anchors * 4, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 4) + + +class LandmarkHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(LandmarkHead, self).__init__() + self.conv1x1 = nn.Conv2d( + inchannels, + num_anchors * 10, + kernel_size=(1, 1), + stride=1, + padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 10) + + +class RetinaFace(nn.Module): + + def __init__(self, cfg=None): + """ + :param cfg: Network related settings. 
+ """ + super(RetinaFace, self).__init__() + backbone = None + if cfg['name'] == 'Resnet50': + backbone = models.resnet50(pretrained=cfg['pretrain']) + else: + raise Exception('Invalid name') + + self.body = _utils.IntermediateLayerGetter(backbone, + cfg['return_layers']) + in_channels_stage2 = cfg['in_channel'] + in_channels_list = [ + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + out_channels = cfg['out_channel'] + self.fpn = FPN(in_channels_list, out_channels) + self.ssh1 = SSH(out_channels, out_channels) + self.ssh2 = SSH(out_channels, out_channels) + self.ssh3 = SSH(out_channels, out_channels) + + self.ClassHead = self._make_class_head( + fpn_num=3, inchannels=cfg['out_channel']) + self.BboxHead = self._make_bbox_head( + fpn_num=3, inchannels=cfg['out_channel']) + self.LandmarkHead = self._make_landmark_head( + fpn_num=3, inchannels=cfg['out_channel']) + + def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): + classhead = nn.ModuleList() + for i in range(fpn_num): + classhead.append(ClassHead(inchannels, anchor_num)) + return classhead + + def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): + bboxhead = nn.ModuleList() + for i in range(fpn_num): + bboxhead.append(BboxHead(inchannels, anchor_num)) + return bboxhead + + def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): + landmarkhead = nn.ModuleList() + for i in range(fpn_num): + landmarkhead.append(LandmarkHead(inchannels, anchor_num)) + return landmarkhead + + def forward(self, inputs): + out = self.body(inputs) + + # FPN + fpn = self.fpn(out) + + # SSH + feature1 = self.ssh1(fpn[0]) + feature2 = self.ssh2(fpn[1]) + feature3 = self.ssh3(fpn[2]) + features = [feature1, feature2, feature3] + + bbox_regressions = torch.cat( + [self.BboxHead[i](feature) for i, feature in enumerate(features)], + dim=1) + classifications = torch.cat( + [self.ClassHead[i](feature) for i, feature in enumerate(features)], + dim=1) + ldm_regressions = torch.cat( + [self.LandmarkHead[i](feat) for i, feat in enumerate(features)], + dim=1) + + output = (bbox_regressions, F.softmax(classifications, + dim=-1), ldm_regressions) + return output diff --git a/modelscope/models/cv/face_detection/retinaface/utils.py b/modelscope/models/cv/face_detection/retinaface/utils.py new file mode 100755 index 00000000..60c9e2dd --- /dev/null +++ b/modelscope/models/cv/face_detection/retinaface/utils.py @@ -0,0 +1,123 @@ +# -------------------------------------------------------- +# Modified from https://github.com/biubug6/Pytorch_Retinaface +# -------------------------------------------------------- + +from itertools import product as product +from math import ceil + +import numpy as np +import torch + + +class PriorBox(object): + + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ + ceil(self.image_size[0] / step), + ceil(self.image_size[1] / step) + ] for step in self.steps] + self.name = 's' + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [ + x * self.steps[k] / self.image_size[1] + for x in [j + 0.5] + ] + dense_cy = [ + y * self.steps[k] / self.image_size[0] + for y in [i + 
0.5] + ] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output + + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat( + (priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + a = priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:] + b = priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:] + c = priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:] + d = priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:] + e = priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:] + landms = torch.cat((a, b, c, d, e), dim=1) + return landms diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index c0f3cbd0..d4f9c6bf 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -2,7 +2,6 @@ import os.path as osp from abc import ABC, abstractmethod -from contextlib import contextmanager from threading import Lock from typing import Any, Dict, Generator, List, Mapping, Union diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py new file mode 100644 index 00000000..20111c11 --- /dev/null +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -0,0 +1,55 @@ +import os.path as osp +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection.retinaface import detection +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_detection, module_name=Pipelines.retina_face_detection) +class RetinaFaceDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + detector = detection.RetinaFaceDetection( + model_path=ckpt_path, device=self.device) + self.detector = detector + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img.astype(np.float32) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector(input) + assert result is not None + bboxes = result[0][:, :4].tolist() + scores = result[0][:, 4].tolist() + lms = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: lms, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/tests/pipelines/test_retina_face_detection.py b/tests/pipelines/test_retina_face_detection.py new file mode 100644 index 00000000..343e1c91 --- /dev/null +++ b/tests/pipelines/test_retina_face_detection.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
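
For reference, a minimal sketch of consuming the new pipeline's output (model id and test image taken from the test case that follows; box and keypoint layout as produced by RetinaFaceDetection.forward above):

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    face_detection = pipeline(
        Tasks.face_detection,
        model='damo/cv_resnet50_face-detection_retinaface')
    result = face_detection('data/test/images/retina_face_detection.jpg')
    # Boxes are [x1, y1, x2, y2] in pixels of the original image; each keypoint
    # entry is five (x, y) landmarks flattened to a length-10 list.
    for box, score, kps in zip(result[OutputKeys.BOXES],
                               result[OutputKeys.SCORES],
                               result[OutputKeys.KEYPOINTS]):
        print(box, score, kps)
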
+import os.path as osp +import unittest + +import cv2 + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_result +from modelscope.utils.test_utils import test_level + + +class RetinaFaceDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_resnet50_face-detection_retinaface' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_detection, model=self.model_id) + img_path = 'data/test/images/retina_face_detection.jpg' + + result = face_detection(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() From adab7d3391c636818372697edc48dffb5f2d25d4 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 09:53:58 +0800 Subject: [PATCH 17/28] =?UTF-8?q?[to=20#42322933]=20=E6=96=B0=E5=A2=9EFER?= =?UTF-8?q?=E4=BA=BA=E8=84=B8=E5=B1=9E=E6=80=A7=E8=AF=86=E5=88=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 完成Maas-cv CR自查; 新增个Task,已经跟产品确认可以增加,正在走流程中,目前还不在https://aone.alibaba-inc.com/v2/project/1181559/req#viewIdentifier=d7f112f9d023e2108fa1b0d8这里,后续会增加过来 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9976346 --- .../images/facial_expression_recognition.jpg | 3 + modelscope/metainfo.py | 2 + .../facial_expression_recognition/__init__.py | 0 .../fer/__init__.py | 0 .../fer/facial_expression_recognition.py | 72 ++++++++++ .../fer/transforms.py | 118 ++++++++++++++++ .../facial_expression_recognition/fer/vgg.py | 40 ++++++ modelscope/outputs.py | 8 ++ modelscope/pipelines/builder.py | 3 + .../facial_expression_recognition_pipeline.py | 128 ++++++++++++++++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 20 +++ .../test_facial_expression_recognition.py | 36 +++++ 13 files changed, 431 insertions(+) create mode 100644 data/test/images/facial_expression_recognition.jpg create mode 100644 modelscope/models/cv/facial_expression_recognition/__init__.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/__init__.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/transforms.py create mode 100644 modelscope/models/cv/facial_expression_recognition/fer/vgg.py create mode 100644 modelscope/pipelines/cv/facial_expression_recognition_pipeline.py create mode 100644 tests/pipelines/test_facial_expression_recognition.py diff --git a/data/test/images/facial_expression_recognition.jpg b/data/test/images/facial_expression_recognition.jpg new file mode 100644 index 00000000..a943fa72 --- /dev/null +++ b/data/test/images/facial_expression_recognition.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb1cef5a5fd5f938a856311011c4820ddc45946a470b9929c61e59b6a065633 +size 161535 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 9638268c..47608d02 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -32,6 +32,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 
'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' @@ -119,6 +120,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' live_category = 'live-category' general_image_classification = 'vit-base_image-classification_ImageNet-labels' diff --git a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py new file mode 100644 index 00000000..c5eb71a1 --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/facial_expression_recognition.py @@ -0,0 +1,72 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import os + +import cv2 +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +from PIL import Image +from torch.autograd import Variable + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from . 
import transforms +from .vgg import VGG + + +@MODELS.register_module( + Tasks.facial_expression_recognition, module_name=Models.fer) +class FacialExpressionRecognition(TorchModel): + + def __init__(self, model_path, device='cuda'): + super().__init__(model_path) + torch.set_grad_enabled(False) + cudnn.benchmark = True + self.model_path = model_path + self.device = device + self.cfg_path = model_path.replace(ModelFile.TORCH_MODEL_FILE, + ModelFile.CONFIGURATION) + self.net = VGG('VGG19', cfg_path=self.cfg_path) + self.load_model() + self.net = self.net.to(device) + self.transform_test = transforms.Compose([ + transforms.TenCrop(44), + transforms.Lambda(lambda crops: torch.stack( + [transforms.ToTensor()(crop) for crop in crops])), + ]) + + self.mean = np.array([[104, 117, 123]]) + + def load_model(self, load_to_cpu=False): + pretrained_dict = torch.load( + self.model_path, map_location=torch.device('cpu')) + self.net.load_state_dict(pretrained_dict['net'], strict=True) + self.net.eval() + + def forward(self, input): + img = input['img'] + img = cv2.cvtColor(img.cpu().numpy(), cv2.COLOR_BGR2GRAY) + img = cv2.resize(img, (48, 48)) + img = img[:, :, np.newaxis] + img = np.concatenate((img, img, img), axis=2) + + img = Image.fromarray(np.uint8(img)) + inputs = self.transform_test(img) + + ncrops, c, h, w = inputs.shape + + inputs = inputs.view(-1, c, h, w) + inputs = inputs.to(self.device) + inputs = Variable(inputs, volatile=True) + outputs = self.net(inputs) + + outputs_avg = outputs.view(ncrops, -1).mean(0) # avg over crops + + score = F.softmax(outputs_avg) + _, predicted = torch.max(outputs_avg.data, 0) + + return score, predicted diff --git a/modelscope/models/cv/facial_expression_recognition/fer/transforms.py b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py new file mode 100644 index 00000000..a1448c49 --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/transforms.py @@ -0,0 +1,118 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import numbers +import types + +import numpy as np +import torch +from PIL import Image + + +def to_tensor(pic): + + # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float().div(255) + else: + return img + + +def center_crop(img, output_size): + if isinstance(output_size, numbers.Number): + output_size = (int(output_size), int(output_size)) + w, h = img.size + th, tw = output_size + i = int(round((h - th) / 2.)) + j = int(round((w - tw) / 2.)) + return img.crop((j, i, j + tw, i + th)) + + +def five_crop(img, size): + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + else: + assert len( + size) == 2, 'Please provide only two dimensions (h, w) for size.' 
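# five_crop returns the four corner crops plus the center crop of a PIL image;
# TenCrop below applies it to the original and the flipped image, producing the
# ten crops (44x44 in the FER model above) whose predictions are averaged at
# inference time via outputs.view(ncrops, -1).mean(0).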
+ + w, h = img.size + crop_h, crop_w = size + if crop_w > w or crop_h > h: + raise ValueError( + 'Requested crop size {} is bigger than input size {}'.format( + size, (h, w))) + tl = img.crop((0, 0, crop_w, crop_h)) + tr = img.crop((w - crop_w, 0, w, crop_h)) + bl = img.crop((0, h - crop_h, crop_w, h)) + br = img.crop((w - crop_w, h - crop_h, w, h)) + center = center_crop(img, (crop_h, crop_w)) + return (tl, tr, bl, br, center) + + +class TenCrop(object): + + def __init__(self, size, vertical_flip=False): + self.size = size + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + assert len( + size + ) == 2, 'Please provide only two dimensions (h, w) for size.' + self.size = size + self.vertical_flip = vertical_flip + + def __call__(self, img): + first_five = five_crop(img, self.size) + + if self.vertical_flip: + img = img.transpose(Image.FLIP_TOP_BOTTOM) + else: + img = img.transpose(Image.FLIP_LEFT_RIGHT) + + second_five = five_crop(img, self.size) + + return first_five + second_five + + +class Compose(object): + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + img = t(img) + return img + + +class ToTensor(object): + + def __call__(self, pic): + return to_tensor(pic) + + +class Lambda(object): + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img): + return self.lambd(img) diff --git a/modelscope/models/cv/facial_expression_recognition/fer/vgg.py b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py new file mode 100644 index 00000000..8120b6cc --- /dev/null +++ b/modelscope/models/cv/facial_expression_recognition/fer/vgg.py @@ -0,0 +1,40 @@ +# The implementation is based on Facial-Expression-Recognition, available at +# https://github.com/WuJie1010/Facial-Expression-Recognition.Pytorch +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable + +from modelscope.utils.config import Config + + +class VGG(nn.Module): + + def __init__(self, vgg_name, cfg_path): + super(VGG, self).__init__() + model_cfg = Config.from_file(cfg_path)['models'] + self.features = self._make_layers(model_cfg[vgg_name]) + self.classifier = nn.Linear(512, 7) + + def forward(self, x): + out = self.features(x) + out = out.view(out.size(0), -1) + out = F.dropout(out, p=0.5, training=self.training) + out = self.classifier(out) + return out + + def _make_layers(self, cfg): + layers = [] + in_channels = 3 + for x in cfg: + if x == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + layers += [ + nn.Conv2d(in_channels, x, kernel_size=3, padding=1), + nn.BatchNorm2d(x), + nn.ReLU(inplace=True) + ] + in_channels = x + layers += [nn.AvgPool2d(kernel_size=1, stride=1)] + return nn.Sequential(*layers) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 8fe71ec2..50668693 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -85,6 +85,14 @@ TASK_OUTPUTS = { Tasks.face_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample + # { + # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], + # "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'] + # } + Tasks.facial_expression_recognition: + [OutputKeys.SCORES, OutputKeys.LABELS], + # face recognition result for single sample # { # "img_embedding": np.array with shape [1, D], diff --git 
a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f6381857..6f901154 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -103,6 +103,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), + Tasks.facial_expression_recognition: + (Pipelines.facial_expression_recognition, + 'damo/cv_vgg19_facial-expression-recognition_fer'), Tasks.face_2d_keypoints: (Pipelines.face_2d_keypoints, 'damo/cv_mobilenet_face-2d-keypoints_alignment'), Tasks.video_multi_modal_embedding: diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py new file mode 100644 index 00000000..4a80878c --- /dev/null +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -0,0 +1,128 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_recognition.align_face import align_face +from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \ + FacialExpressionRecognition +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.facial_expression_recognition, + module_name=Pipelines.facial_expression_recognition) +class FacialExpressionRecognitionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + fer = FacialExpressionRecognition(model_path=ckpt_path, device=device) + self.fer = fer + self.device = device + logger.info('load model done') + + # face detect pipeline + det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.face_detection = pipeline( + Tasks.face_detection, model=det_model_id) + + def _choose_face(self, + det_result, + min_face=10, + top_face=1, + center_face=False): + ''' + choose face with maximum area + Args: + det_result: output of face detection pipeline + min_face: minimum size of valid face w/h + top_face: take faces with top max areas + center_face: choose the most centerd face from multi faces, only valid if top_face > 1 + ''' + bboxes = np.array(det_result[OutputKeys.BOXES]) + landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) + if bboxes.shape[0] == 0: + logger.info('Warning: No face detected!') + return None + # face idx with enough size + face_idx = [] + for i in range(bboxes.shape[0]): + box = bboxes[i] + if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: + face_idx += [i] + if len(face_idx) == 0: + logger.info( + f'Warning: Face size not enough, less than {min_face}x{min_face}!' 
+ ) + return None + bboxes = bboxes[face_idx] + landmarks = landmarks[face_idx] + # find max faces + boxes = np.array(bboxes) + area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + sort_idx = np.argsort(area)[-top_face:] + # find center face + if top_face > 1 and center_face and bboxes.shape[0] > 1: + img_center = [img.shape[1] // 2, img.shape[0] // 2] + min_dist = float('inf') + sel_idx = -1 + for _idx in sort_idx: + box = boxes[_idx] + dist = np.square( + np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( + np.abs((box[1] + box[3]) / 2 - img_center[1])) + if dist < min_dist: + min_dist = dist + sel_idx = _idx + sort_idx = [sel_idx] + main_idx = sort_idx[-1] + return bboxes[main_idx], landmarks[main_idx] + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + img = img[:, :, ::-1] + det_result = self.face_detection(img.copy()) + rtn = self._choose_face(det_result) + face_img = None + if rtn is not None: + _, face_lmks = rtn + face_lmks = face_lmks.reshape(5, 2) + face_img, _ = align_face(img, (112, 112), face_lmks) + face_img = face_img.astype(np.float32) + result = {} + result['img'] = face_img + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.fer(input) + assert result is not None + scores = result[0].tolist() + labels = result[1].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 1b738bfe..32185fb9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -20,6 +20,7 @@ class CVTasks(object): animal_recognition = 'animal-recognition' face_detection = 'face-detection' face_recognition = 'face-recognition' + facial_expression_recognition = 'facial-expression-recognition' face_2d_keypoints = 'face-2d-keypoints' human_detection = 'human-detection' human_object_interaction = 'human-object-interaction' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index ea1d95b5..cb07ba1a 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -89,6 +89,26 @@ def draw_keypoints(output, original_image): return image +def draw_facial_expression_result(img_path, facial_expression_result): + label_idx = facial_expression_result[OutputKeys.LABELS] + map_list = [ + 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' + ] + label = map_list[label_idx] + + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + cv2.putText( + img, + 'facial expression: {}'.format(label), (10, 10), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + print('facial expression: {}'.format(label)) + return img + + def draw_face_detection_result(img_path, detection_result): bboxes = np.array(detection_result[OutputKeys.BOXES]) kpss = np.array(detection_result[OutputKeys.KEYPOINTS]) diff --git a/tests/pipelines/test_facial_expression_recognition.py b/tests/pipelines/test_facial_expression_recognition.py new file mode 100644 index 00000000..fff83ad6 --- /dev/null +++ b/tests/pipelines/test_facial_expression_recognition.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
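
As implemented in the pipeline's forward above, OutputKeys.LABELS holds the predicted class index and OutputKeys.SCORES the 7-way softmax; draw_facial_expression_result indexes into its name list with that index. A minimal sketch of turning the result into a readable expression name (names, model id and test image taken from the code above and the test that follows):

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    expression_names = [
        'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
    ]
    fer = pipeline(
        Tasks.facial_expression_recognition,
        model='damo/cv_vgg19_facial-expression-recognition_fer')
    result = fer('data/test/images/facial_expression_recognition.jpg')
    label_idx = result[OutputKeys.LABELS]   # index of the top-scoring class
    scores = result[OutputKeys.SCORES]      # softmax over the seven classes
    print(expression_names[label_idx], scores[label_idx])
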
+import os.path as osp +import unittest + +import cv2 +import numpy as np + +from modelscope.msdatasets import MsDataset +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_facial_expression_result +from modelscope.utils.test_utils import test_level + + +class FacialExpressionRecognitionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_vgg19_facial-expression-recognition_fer' + + def show_result(self, img_path, facial_expression_result): + img = draw_facial_expression_result(img_path, facial_expression_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + fer = pipeline( + Tasks.facial_expression_recognition, model=self.model_id) + img_path = 'data/test/images/facial_expression_recognition.jpg' + result = fer(img_path) + self.show_result(img_path, result) + + +if __name__ == '__main__': + unittest.main() From 3e92dac3283839fef9e9e9adbc1a9c7edbe5c714 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 5 Sep 2022 09:55:26 +0800 Subject: [PATCH 18/28] [to #42322933]lazy load activate for shop segmentation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10009052 --- .../models/cv/shop_segmentation/__init__.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/modelscope/models/cv/shop_segmentation/__init__.py b/modelscope/models/cv/shop_segmentation/__init__.py index b40a0760..072628bd 100644 --- a/modelscope/models/cv/shop_segmentation/__init__.py +++ b/modelscope/models/cv/shop_segmentation/__init__.py @@ -1 +1,20 @@ -from .shop_seg_base import SHOPSEG +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .shop_seg_base import SHOPSEG + +else: + _import_structure = {'shop_seg_base': ['SHOPSEG']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) From a9c14e4eadd64e30820b689b47f5e2ebc19516f4 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 5 Sep 2022 11:07:48 +0800 Subject: [PATCH 19/28] [to #42322933] Support saving the best checkpoint for inference 1. Support saving the best checkpoint for inference 2. Fix a bug that _max_iters field does not exist in trainer 3. Fix a bug that function in lambda_lr field cannot be saved to file 4. Fix a bug that save_pretrained would not be called by iterating 5. Fix a bug that interval is not passed from BestCkptHook's init Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9972765 --- modelscope/trainers/hooks/checkpoint_hook.py | 44 ++++++++++--------- modelscope/trainers/hooks/hook.py | 4 +- modelscope/utils/checkpoint.py | 17 ++++--- modelscope/utils/config.py | 3 ++ .../trainers/test_finetune_text_generation.py | 22 +++++----- 5 files changed, 50 insertions(+), 40 deletions(-) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index cf7a0f7a..fcd8e982 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -27,7 +27,7 @@ class CheckpointHook(Hook): save_last (bool): Whether to save the last checkpoint. Default: True. 
""" - PRIORITY = Priority.NORMAL + PRIORITY = Priority.LOW def __init__(self, interval=0, @@ -75,25 +75,27 @@ class CheckpointHook(Hook): self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) - self._save_pretrained(trainer) + if (self.is_last_epoch(trainer) + and self.by_epoch) or (self.is_last_iter(trainer) + and not self.by_epoch): + self._save_pretrained(trainer) def _save_pretrained(self, trainer): - if self.is_last_epoch(trainer) and self.by_epoch: - output_dir = os.path.join(self.save_dir, - ModelFile.TRAIN_OUTPUT_DIR) - from modelscope.trainers.parallel.utils import is_parallel - - if is_parallel(trainer.model): - model = trainer.model.module - else: - model = trainer.model - - if hasattr(model, 'save_pretrained'): - model.save_pretrained( - output_dir, - ModelFile.TORCH_MODEL_BIN_FILE, - save_function=save_checkpoint, - config=trainer.cfg.to_dict()) + output_dir = os.path.join(self.save_dir, ModelFile.TRAIN_OUTPUT_DIR) + from modelscope.trainers.parallel.utils import is_parallel + + if is_parallel(trainer.model): + model = trainer.model.module + else: + model = trainer.model + + if hasattr(model, 'save_pretrained'): + model.save_pretrained( + output_dir, + ModelFile.TORCH_MODEL_BIN_FILE, + save_function=save_checkpoint, + config=trainer.cfg.to_dict(), + with_meta=False) def after_train_iter(self, trainer): if self.by_epoch: @@ -133,7 +135,7 @@ class BestCkptSaverHook(CheckpointHook): save_dir (str): Output directory to save best checkpoint. """ - PRIORITY = Priority.NORMAL + PRIORITY = Priority.LOW rule_map = {'max': lambda x, y: x > y, 'min': lambda x, y: x < y} def __init__(self, @@ -141,9 +143,11 @@ class BestCkptSaverHook(CheckpointHook): rule='max', by_epoch=True, save_optimizer=True, - save_dir=None): + save_dir=None, + interval=0): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' super().__init__( + interval=interval, by_epoch=by_epoch, save_optimizer=save_optimizer, save_dir=save_dir, diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index 75cc226c..1c567f1c 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -199,14 +199,14 @@ class Hook: Whether to reach the last epoch Returns: bool """ - return trainer.epoch + 1 == trainer._max_epochs + return trainer.epoch + 1 == trainer.max_epochs def is_last_iter(self, trainer): """ Whether to reach the last iteration in the entire training process Returns: bool """ - return trainer.iter + 1 == trainer._max_iters + return trainer.iter + 1 == trainer.max_iters def get_triggered_stages(self): trigger_stages = set() diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 8b9d027a..425d3312 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -40,7 +40,8 @@ def weights_to_cpu(state_dict): def save_checkpoint(model: torch.nn.Module, filename: str, optimizer: Optional[Optimizer] = None, - meta: Optional[dict] = None) -> None: + meta: Optional[dict] = None, + with_meta: bool = True) -> None: """Save checkpoint to file. 
The checkpoint will have 3 fields: ``meta``, ``state_dict`` and @@ -65,10 +66,14 @@ def save_checkpoint(model: torch.nn.Module, # save class name to the meta meta.update(CLASSES=model.CLASSES) - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(model.state_dict()) - } + if with_meta: + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(model.state_dict()) + } + else: + checkpoint = weights_to_cpu(model.state_dict()) + # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() @@ -141,7 +146,7 @@ def save_pretrained(model, # Save the ckpt to the save directory try: - save_function(model, output_ckpt_path) + save_function(model, output_ckpt_path, **kwargs) except Exception as e: raise Exception( f'During saving checkpoints, the error of "{type(e).__name__} ' diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 42985db6..7d972118 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -9,6 +9,7 @@ import sys import tempfile import types from pathlib import Path +from types import FunctionType from typing import Dict, Union import addict @@ -638,6 +639,8 @@ class JSONIteratorEncoder(json.JSONEncoder): """ def default(self, obj): + if isinstance(obj, FunctionType): + return None try: iterable = iter(obj) except TypeError: diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py index 8cdfdf01..a561effe 100644 --- a/tests/trainers/test_finetune_text_generation.py +++ b/tests/trainers/test_finetune_text_generation.py @@ -128,15 +128,14 @@ class TestFinetuneTextGeneration(unittest.TestCase): @unittest.skip def test_finetune_cnndm(self): - from datasets import load_dataset - dataset_dict = load_dataset('ccdv/cnn_dailymail', '3.0.0') - train_dataset = dataset_dict['train'] \ - .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \ - .remove_columns('id') - eval_dataset = dataset_dict['validation'] \ - .rename_columns({'article': 'src_txt', 'highlights': 'tgt_txt'}) \ - .remove_columns('id') - num_warmup_steps = 2000 + from modelscope.msdatasets import MsDataset + dataset_dict = MsDataset.load('dureader_robust_qg') + train_dataset = dataset_dict['train'].to_hf_dataset() \ + .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) + eval_dataset = dataset_dict['validation'].to_hf_dataset() \ + .rename_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) + num_warmup_steps = 200 + os.environ['LOCAL_RANK'] = '0' def noam_lambda(current_step: int): current_step += 1 @@ -154,12 +153,11 @@ class TestFinetuneTextGeneration(unittest.TestCase): return cfg kwargs = dict( - model=self.model_id, + model='damo/nlp_palm2.0_text-generation_chinese-base', train_dataset=train_dataset, eval_dataset=eval_dataset, work_dir=self.tmp_dir, - cfg_modify_fn=cfg_modify_fn, - model_revision='beta') + cfg_modify_fn=cfg_modify_fn) trainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() From b870e4eed541405380c6bbca78e44a06f947aae7 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Mon, 5 Sep 2022 13:26:30 +0800 Subject: [PATCH 20/28] [to #42322933] test: use custom config to reduce test time Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011826 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 、 --- modelscope/models/audio/ans/complex_nn.py | 6 +++--- modelscope/models/audio/ans/unet.py | 5 +++-- tests/trainers/audio/test_ans_trainer.py | 
10 +++++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py index c61446c2..9768eff7 100644 --- a/modelscope/models/audio/ans/complex_nn.py +++ b/modelscope/models/audio/ans/complex_nn.py @@ -1,7 +1,7 @@ """ -class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d are the work of -Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). -from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +The implementation of class ComplexConv2d, ComplexConvTranspose2d and ComplexBatchNorm2d + here is modified based on Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) +and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch """ import torch diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py index ae66eb69..3a9c5549 100644 --- a/modelscope/models/audio/ans/unet.py +++ b/modelscope/models/audio/ans/unet.py @@ -1,6 +1,7 @@ """ -Based on the work of Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ). -from https://github.com/sweetcocoa/DeepComplexUNetPyTorch +The implementation here is modified based on + Jongho Choi(sweetcocoa@snu.ac.kr / Seoul National Univ., ESTsoft ) +and publicly available at https://github.com/sweetcocoa/DeepComplexUNetPyTorch """ import torch import torch.nn as nn diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py index 176c811f..ed8cd1fe 100644 --- a/tests/trainers/audio/test_ans_trainer.py +++ b/tests/trainers/audio/test_ans_trainer.py @@ -8,12 +8,14 @@ from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.audio.audio_utils import to_segment +from modelscope.utils.hub import read_config from modelscope.utils.test_utils import test_level SEGMENT_LENGTH_TEST = 640 class TestANSTrainer(unittest.TestCase): + REVISION = 'beta' def setUp(self): self.tmp_dir = tempfile.TemporaryDirectory().name @@ -21,6 +23,11 @@ class TestANSTrainer(unittest.TestCase): os.makedirs(self.tmp_dir) self.model_id = 'damo/speech_frcrn_ans_cirm_16k' + cfg = read_config(self.model_id, revision=self.REVISION) + cfg.train.max_epochs = 2 + cfg.train.dataloader.batch_size_per_gpu = 1 + self.cfg_file = os.path.join(self.tmp_dir, 'train_config.json') + cfg.dump(self.cfg_file) hf_ds = MsDataset.load( 'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset() @@ -39,12 +46,13 @@ class TestANSTrainer(unittest.TestCase): def test_trainer(self): kwargs = dict( model=self.model_id, - model_revision='beta', + model_revision=self.REVISION, train_dataset=self.dataset, eval_dataset=self.dataset, max_epochs=2, train_iters_per_epoch=2, val_iters_per_epoch=1, + cfg_file=self.cfg_file, work_dir=self.tmp_dir) trainer = build_trainer( From c25e60c67dc7891a21065e912b30e276c77ccf7e Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 13:52:54 +0800 Subject: [PATCH 21/28] [to #42322933]add lazy load Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10011795 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [to #42322933] 新增FER人脸属性识别 --- .../facial_expression_recognition/__init__.py | 20 +++++++++++++++++++ .../fer/__init__.py | 2 ++ modelscope/pipelines/cv/__init__.py | 4 ++++ .../facial_expression_recognition_pipeline.py | 2 +- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git 
a/modelscope/models/cv/facial_expression_recognition/__init__.py b/modelscope/models/cv/facial_expression_recognition/__init__.py index e69de29b..35a15d18 100644 --- a/modelscope/models/cv/facial_expression_recognition/__init__.py +++ b/modelscope/models/cv/facial_expression_recognition/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .fer import FacialExpressionRecognition + +else: + _import_structure = {'fer': ['FacialExpressionRecognition']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py index e69de29b..2546035b 100644 --- a/modelscope/models/cv/facial_expression_recognition/fer/__init__.py +++ b/modelscope/models/cv/facial_expression_recognition/fer/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .facial_expression_recognition import FacialExpressionRecognition diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index d3dba978..ac1ed82c 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -47,6 +47,8 @@ if TYPE_CHECKING: from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline + else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -105,6 +107,8 @@ else: ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], + 'facial_expression_recognition_pipelin': + ['FacialExpressionRecognitionPipeline'] } import sys diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 4a80878c..c5577dcf 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -8,7 +8,7 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.cv.face_recognition.align_face import align_face -from modelscope.models.cv.facial_expression_recognition.fer.facial_expression_recognition import \ +from modelscope.models.cv.facial_expression_recognition import \ FacialExpressionRecognition from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline From f4ca0b8aabe916d010f62dab685625cd3c84c28a Mon Sep 17 00:00:00 2001 From: ly261666 Date: Mon, 5 Sep 2022 15:54:57 +0800 Subject: [PATCH 22/28] [to #42322933]add lazy import Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10012143 --- .../models/cv/face_detection/__init__.py | 22 +++++++++++++++++++ .../cv/face_detection/retinaface/__init__.py | 1 + modelscope/pipelines/cv/__init__.py | 2 ++ .../cv/retina_face_detection_pipeline.py | 7 ++++-- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index e69de29b..a3c47164 
100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .retinaface import RetinaFaceDetection + +else: + _import_structure = { + 'retinaface': ['RetinaFaceDetection'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/face_detection/retinaface/__init__.py b/modelscope/models/cv/face_detection/retinaface/__init__.py index e69de29b..779aaf1c 100644 --- a/modelscope/models/cv/face_detection/retinaface/__init__.py +++ b/modelscope/models/cv/face_detection/retinaface/__init__.py @@ -0,0 +1 @@ +from .detection import RetinaFaceDetection diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index ac1ed82c..960ed621 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -47,6 +47,7 @@ if TYPE_CHECKING: from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipleline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline + from .retina_face_detection_pipeline import RetinaFaceDetectionPipeline from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline else: @@ -107,6 +108,7 @@ else: ['TextDrivenSegmentationPipeline'], 'movie_scene_segmentation_pipeline': ['MovieSceneSegmentationPipeline'], + 'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'], 'facial_expression_recognition_pipelin': ['FacialExpressionRecognitionPipeline'] } diff --git a/modelscope/pipelines/cv/retina_face_detection_pipeline.py b/modelscope/pipelines/cv/retina_face_detection_pipeline.py index 20111c11..b8c64405 100644 --- a/modelscope/pipelines/cv/retina_face_detection_pipeline.py +++ b/modelscope/pipelines/cv/retina_face_detection_pipeline.py @@ -1,10 +1,13 @@ import os.path as osp from typing import Any, Dict +import cv2 import numpy as np +import PIL +import torch from modelscope.metainfo import Pipelines -from modelscope.models.cv.face_detection.retinaface import detection +from modelscope.models.cv.face_detection import RetinaFaceDetection from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -28,7 +31,7 @@ class RetinaFaceDetectionPipeline(Pipeline): super().__init__(model=model, **kwargs) ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_FILE) logger.info(f'loading model from {ckpt_path}') - detector = detection.RetinaFaceDetection( + detector = RetinaFaceDetection( model_path=ckpt_path, device=self.device) self.detector = detector logger.info('load model done') From 042cff7d68dce03f12a010d8b3723395fccde998 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 5 Sep 2022 16:08:50 +0800 Subject: [PATCH 23/28] [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10014958 * [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. 
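
The change below replaces the bulk pip install -r calls with per-package installs (via awk and xargs) so that a failing dependency is surfaced on its own. A rough, hypothetical Python equivalent of that per-package strategy, for illustration only:

    import subprocess
    import sys

    INDEX = 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html'

    def install_one_by_one(requirements_file):
        # Mirrors the awk '/^[^#]/ { print $1 }' | xargs -n 1 pip install loop:
        # skip comment lines and install each requirement in its own pip call.
        with open(requirements_file) as f:
            for line in f:
                pkg = line.split(':')[0].strip()
                if not pkg or pkg.startswith('#'):
                    continue
                subprocess.run(
                    [sys.executable, '-m', 'pip', 'install', pkg, '-f', INDEX],
                    check=True)

    for req in ('requirements.txt', 'requirements/audio.txt',
                'requirements/cv.txt', 'requirements/multi-modal.txt',
                'requirements/nlp.txt'):
        install_one_by_one(req)
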
--- .dev_scripts/ci_container_test.sh | 10 +++++----- .dev_scripts/citest.sh | 19 ------------------- tests/run_config.yaml | 5 +---- 3 files changed, 6 insertions(+), 28 deletions(-) delete mode 100644 .dev_scripts/citest.sh diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index a53c08c6..194a48b3 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,8 +1,8 @@ -pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html pip install -r requirements/tests.txt git config --global --add safe.directory /Maas-lib diff --git a/.dev_scripts/citest.sh b/.dev_scripts/citest.sh deleted file mode 100644 index c6e0905f..00000000 --- a/.dev_scripts/citest.sh +++ /dev/null @@ -1,19 +0,0 @@ -pip install -r requirements.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html - -pip install -r requirements/tests.txt -# install numpy<=1.18 for tensorflow==1.15.x -pip install "numpy<=1.18" - -# linter test -# use internal project for pre-commit due to the network problem -pre-commit run --all-files -if [ $? -ne 0 ]; then - echo "linter test failed, please run 'pre-commit run --all-files' to check" - exit -1 -fi - -PYTHONPATH=. python tests/run.py diff --git a/tests/run_config.yaml b/tests/run_config.yaml index 591dcd66..f44053f6 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -1,7 +1,4 @@ -# envs option allows fine-grained control for test executoin, for example, -# python tests/run.py --env pytorch -# would only trigger exeutions of all pytorch cases. -# envs option defaults to None for backward compatbility +# isolate cases in env, we can install different dependencies in each env. isolated: # test cases that may require excessive anmount of GPU memory, which will be executed in dedicagted process. 
- test_text_to_speech.py - test_multi_modal_embedding.py From f660a119f02cbf767521eea322f96faf2bb883c8 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Mon, 5 Sep 2022 16:19:45 +0800 Subject: [PATCH 24/28] [to #42322933]Add resumable and large data upload. CR Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9995250 1. add resumable dataset upload 2. add large data upload (up to 48.8TB) --- modelscope/msdatasets/ms_dataset.py | 8 +------ modelscope/msdatasets/utils/oss_utils.py | 24 +++++++++++++++------ modelscope/msdatasets/utils/upload_utils.py | 22 +++++++++---------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 338c6333..28a95643 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -574,14 +574,8 @@ class MsDataset: None """ - from modelscope.hub.api import HubApi - _hub_api = HubApi() - cookies = _hub_api.check_cookies_upload_data(use_cookies=True) _upload_manager = DatasetUploadManager( - dataset_name=dataset_name, - namespace=namespace, - version=version, - cookies=cookies) + dataset_name=dataset_name, namespace=namespace, version=version) _upload_manager.upload(object_name, local_file_path) @staticmethod diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index 63a1cf77..9a7040a1 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -18,6 +18,12 @@ class OssUtilities: self.oss_dir = oss_config['Dir'] self.oss_backup_dir = oss_config['BackupDir'] + self.upload_resumable_tmp_store = '/tmp/modelscope/tmp_dataset' + self.upload_multipart_threshold = 50 * 1024 * 1024 + self.upload_part_size = 1 * 1024 * 1024 + self.upload_num_threads = 4 + self.upload_max_retries = 3 + @staticmethod def _percentage(consumed_bytes, total_bytes): if total_bytes: @@ -42,21 +48,27 @@ class OssUtilities: progress_callback=self._percentage) return local_path - def upload(self, oss_file_name: str, local_file_path: str) -> str: - max_retries = 3 + def upload(self, oss_object_name: str, local_file_path: str) -> str: retry_count = 0 - object_key = os.path.join(self.oss_dir, oss_file_name) + object_key = os.path.join(self.oss_dir, oss_object_name) + resumable_store = oss2.ResumableStore( + root=self.upload_resumable_tmp_store) while True: try: retry_count += 1 - self.bucket.put_object_from_file( + oss2.resumable_upload( + self.bucket, object_key, local_file_path, - progress_callback=self._percentage) + store=resumable_store, + multipart_threshold=self.upload_multipart_threshold, + part_size=self.upload_part_size, + progress_callback=self._percentage, + num_threads=self.upload_num_threads) break except Exception: - if retry_count >= max_retries: + if retry_count >= self.upload_max_retries: raise return object_key diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index eff3aca0..fbe5c531 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,23 +1,21 @@ -from http.cookiejar import CookieJar - from .oss_utils import OssUtilities class DatasetUploadManager(object): - def __init__(self, dataset_name: str, namespace: str, version: str, - cookies: CookieJar): + def __init__(self, dataset_name: str, namespace: str, version: str): from modelscope.hub.api import HubApi - api = HubApi() - oss_config = api.get_dataset_access_config_session( - cookies=cookies, + _hub_api 
= HubApi() + _cookies = _hub_api.check_cookies_upload_data(use_cookies=True) + _oss_config = _hub_api.get_dataset_access_config_session( + cookies=_cookies, dataset_name=dataset_name, namespace=namespace, revision=version) - self.oss_utilities = OssUtilities(oss_config) + self.oss_utilities = OssUtilities(_oss_config) - def upload(self, oss_file_name: str, local_file_path: str) -> str: - oss_object_key = self.oss_utilities.upload( - oss_file_name=oss_file_name, local_file_path=local_file_path) - return oss_object_key + def upload(self, object_name: str, local_file_path: str) -> str: + object_key = self.oss_utilities.upload( + oss_object_name=object_name, local_file_path=local_file_path) + return object_key From 4484dcaa04ca49b7e90954b032118922ee7811ba Mon Sep 17 00:00:00 2001 From: "liangting.zl" Date: Mon, 5 Sep 2022 16:42:40 +0800 Subject: [PATCH 25/28] [to #42322933] feat: add hand keypoints pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9961906 * feat: add hand keypoints pipeline --- data/test/images/hand_keypoints.jpg | 3 ++ modelscope/metainfo.py | 1 + modelscope/outputs.py | 15 ++++++ modelscope/pipelines/builder.py | 3 ++ modelscope/pipelines/cv/__init__.py | 2 + .../cv/hand_2d_keypoints_pipeline.py | 51 +++++++++++++++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_hand_2d_keypoints.py | 45 ++++++++++++++++ 8 files changed, 121 insertions(+) create mode 100644 data/test/images/hand_keypoints.jpg create mode 100644 modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py create mode 100644 tests/pipelines/test_hand_2d_keypoints.py diff --git a/data/test/images/hand_keypoints.jpg b/data/test/images/hand_keypoints.jpg new file mode 100644 index 00000000..cb445c26 --- /dev/null +++ b/data/test/images/hand_keypoints.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c05d58edee7398de37b8e479410676d6b97cfde69cc003e8356a348067e71988 +size 7750 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 47608d02..3ac2f2df 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -112,6 +112,7 @@ class Pipelines(object): hicossl_video_embedding = 'hicossl-s3dg-video_embedding' body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' body_3d_keypoints = 'canonical_body-3d-keypoints_video' + hand_2d_keypoints = 'hrnetv2w18_hand-2d-keypoints_image' human_detection = 'resnet18-human-detection' object_detection = 'vit-object-detection' easycv_detection = 'easycv-detection' diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 50668693..c6a7a619 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -219,6 +219,21 @@ TASK_OUTPUTS = { # } Tasks.body_3d_keypoints: [OutputKeys.POSES], + # 2D hand keypoints result for single sample + # { + # "keypoints": [ + # [[x, y, score] * 21], + # [[x, y, score] * 21], + # [[x, y, score] * 21], + # ], + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], + # video single object tracking result for single video # { # "boxes": [ diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 6f901154..9f265fb8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -99,6 +99,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_hrnetv2w32_body-2d-keypoints_image'), Tasks.body_3d_keypoints: (Pipelines.body_3d_keypoints, 'damo/cv_canonical_body-3d-keypoints_video'), + Tasks.hand_2d_keypoints: + 
(Pipelines.hand_2d_keypoints, + 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet_facedetection_scrfd10gkps'), Tasks.face_recognition: (Pipelines.face_recognition, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 960ed621..72a225ff 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from .animal_recognition_pipeline import AnimalRecognitionPipeline from .body_2d_keypoints_pipeline import Body2DKeypointsPipeline from .body_3d_keypoints_pipeline import Body3DKeypointsPipeline + from .hand_2d_keypoints_pipeline import Hand2DKeypointsPipeline from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline from .hicossl_video_embedding_pipeline import HICOSSLVideoEmbeddingPipeline from .crowd_counting_pipeline import CrowdCountingPipeline @@ -57,6 +58,7 @@ else: 'animal_recognition_pipeline': ['AnimalRecognitionPipeline'], 'body_2d_keypoints_pipeline': ['Body2DKeypointsPipeline'], 'body_3d_keypoints_pipeline': ['Body3DKeypointsPipeline'], + 'hand_2d_keypoints_pipeline': ['Hand2DKeypointsPipeline'], 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], 'hicossl_video_embedding_pipeline': ['HICOSSLVideoEmbeddingPipeline'], 'crowd_counting_pipeline': ['CrowdCountingPipeline'], diff --git a/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py new file mode 100644 index 00000000..db66f5d2 --- /dev/null +++ b/modelscope/pipelines/cv/hand_2d_keypoints_pipeline.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path + +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from .easycv_pipelines.base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.hand_2d_keypoints, module_name=Pipelines.hand_2d_keypoints) +class Hand2DKeypointsPipeline(EasyCVPipeline): + """Pipeline for hand pose keypoint task.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
+ """ + self.model_dir = model + super(Hand2DKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def _build_predict_op(self): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + detection_predictor_type = self.cfg['DETECTION']['type'] + detection_model_path = os.path.join( + self.model_dir, self.cfg['DETECTION']['model_path']) + detection_cfg_file = os.path.join(self.model_dir, + self.cfg['DETECTION']['config_file']) + detection_score_threshold = self.cfg['DETECTION']['score_threshold'] + self.cfg.pipeline.predictor_config[ + 'detection_predictor_config'] = dict( + type=detection_predictor_type, + model_path=detection_model_path, + config_file=detection_cfg_file, + score_threshold=detection_score_threshold) + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config + }) + return pipeline_op diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 32185fb9..47d38dd7 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -27,6 +27,7 @@ class CVTasks(object): face_image_generation = 'face-image-generation' body_2d_keypoints = 'body-2d-keypoints' body_3d_keypoints = 'body-3d-keypoints' + hand_2d_keypoints = 'hand-2d-keypoints' general_recognition = 'general-recognition' image_classification = 'image-classification' diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py new file mode 100644 index 00000000..86cd2d06 --- /dev/null +++ b/tests/pipelines/test_hand_2d_keypoints.py @@ -0,0 +1,45 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class Hand2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_hand_2d_keypoints(self): + img_path = 'data/test/images/hand_keypoints.jpg' + model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' + + hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id) + outputs = hand_keypoint(img_path) + self.assertEqual(len(outputs), 1) + + results = outputs[0] + self.assertIn(OutputKeys.KEYPOINTS, results.keys()) + self.assertIn(OutputKeys.BOXES, results.keys()) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3) + self.assertEqual(results[OutputKeys.BOXES].shape[1], 4) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_hand_2d_keypoints_with_default_model(self): + img_path = 'data/test/images/hand_keypoints.jpg' + + hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints) + outputs = hand_keypoint(img_path) + self.assertEqual(len(outputs), 1) + + results = outputs[0] + self.assertIn(OutputKeys.KEYPOINTS, results.keys()) + self.assertIn(OutputKeys.BOXES, results.keys()) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) + self.assertEqual(results[OutputKeys.KEYPOINTS].shape[2], 3) + self.assertEqual(results[OutputKeys.BOXES].shape[1], 4) + + +if __name__ == '__main__': + unittest.main() From 83dbf713020b7c45cd22b0ebcc366eb73ec5d899 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 5 Sep 2022 17:38:05 +0800 Subject: [PATCH 26/28] 
[to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10019738 * [to #44702084]fix: ci pip install domain in single commands, find with requirement install failed is complicated. --- .dev_scripts/ci_container_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 194a48b3..129a6c25 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,4 +1,4 @@ -awk -F: '/^[^#]/ { print $1 }' requirements.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html From 3d3f9b45377abad27b9e9272ee294a2f2ee50ea9 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Mon, 5 Sep 2022 17:51:22 +0800 Subject: [PATCH 27/28] [to #42322933] fix checkpoint format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Fix the issue in the palm, gpt3 and mplug models where a checkpoint saved after finetuning has different key names from the original checkpoint and therefore cannot be loaded with from_pretrained. 2. Change test_finetune_mplug.py to save only the checkpoint at the end of training, to cut CI time. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10016517 --- .../multi_modal/mplug/modeling_mplug.py | 10 +++--- modelscope/models/nlp/gpt3/modeling_gpt3.py | 4 +++ .../models/nlp/palm_v2/modeling_palm.py | 16 ++++----- tests/trainers/test_finetune_mplug.py | 33 ++++++++++--------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 78f60f9b..f469c218 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -1867,11 +1867,13 @@ class MPlug(PreTrainedModel): ModelFile.TORCH_MODEL_BIN_FILE) checkpoint = torch.load(checkpoint_path, map_location='cpu') if 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint['module'] + checkpoint = checkpoint['model'] + checkpoint = { + k.replace('model.', ''): v + for k, v in checkpoint.items() + } - msg = model.load_state_dict(state_dict, strict=False) + msg = model.load_state_dict(checkpoint, strict=False) print('load checkpoint from %s' % checkpoint_path) print(msg) return model diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 4e30f697..69e9ba7c 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -339,5 +339,9 @@ class GPT3Model(PreTrainedModel): state_dict_file = os.path.join(pretrained_model_name_or_path, ModelFile.TORCH_MODEL_BIN_FILE) state_dict = torch.load(state_dict_file) + state_dict = { + k.replace('model.language_model', 'language_model'): v + for k, v in state_dict.items() + } model.load_state_dict(state_dict) return model diff
--git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/modeling_palm.py index ff6fd732..99b00454 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/modeling_palm.py @@ -592,11 +592,11 @@ class AbsSummarizer(PalmPreTrainedModel): # Model self.generator.dense.weight = self.decoder.embeddings.weight if checkpoint is not None: - for key in list(checkpoint['model'].keys()): - checkpoint['model'][key.replace('module.', - '')] = checkpoint['model'][key] - msg = self.load_state_dict(checkpoint['model'], strict=False) - print(msg) + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + for key in list(checkpoint.keys()): + checkpoint[key.replace('model.palm.', '')] = checkpoint[key] + self.load_state_dict(checkpoint, strict=False) else: for module in self.decoder.modules(): if isinstance(module, (nn.Linear, nn.Embedding)): @@ -734,7 +734,7 @@ class PalmForConditionalGeneration(PalmPreTrainedModel): return addict.Dict(loss=loss) -class Translator(nn.Module): +class Translator(object): """ Uses a model to translate a batch of sentences. """ @@ -1298,8 +1298,8 @@ class Translator(nn.Module): return results - def forward(self, input_ids: torch.Tensor, - attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: + def __call__(self, input_ids: torch.Tensor, + attention_mask: torch.Tensor) -> Dict[str, torch.Tensor]: batch = self.Batch( batch_size=input_ids.size()[0], src=input_ids, diff --git a/tests/trainers/test_finetune_mplug.py b/tests/trainers/test_finetune_mplug.py index b46dbf45..72196fba 100644 --- a/tests/trainers/test_finetune_mplug.py +++ b/tests/trainers/test_finetune_mplug.py @@ -41,6 +41,18 @@ class TestFinetuneMPlug(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() + def _cfg_modify_fn(self, cfg): + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': self.max_epochs + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }] + return cfg + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer_with_caption(self): kwargs = dict( @@ -48,15 +60,12 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_caption_with_model_and_args(self): @@ -86,15 +95,12 @@ class TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_vqa_with_model_and_args(self): @@ -124,15 +130,12 @@ class 
TestFinetuneMPlug(unittest.TestCase): train_dataset=self.train_dataset, eval_dataset=self.test_dataset, max_epochs=self.max_epochs, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + cfg_modify_fn=self._cfg_modify_fn) trainer: EpochBasedTrainer = build_trainer( name=Trainers.nlp_base_trainer, default_args=kwargs) trainer.train() - results_files = os.listdir(self.tmp_dir) - self.assertIn(f'{trainer.timestamp}.log.json', results_files) - for i in range(self.max_epochs): - self.assertIn(f'epoch_{i+1}.pth', results_files) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_retrieval_with_model_and_args(self): From e365023862995b921f74d902a69667933fa58060 Mon Sep 17 00:00:00 2001 From: "feiwu.yfw" Date: Mon, 5 Sep 2022 19:36:46 +0800 Subject: [PATCH 28/28] Fix the exception raised when torch.tensor outputs from the processor are converted to numpy Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10021802 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix to_torch_dataset --- modelscope/msdatasets/ms_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 28a95643..691db4fe 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -70,12 +70,12 @@ class MsIterableDataset(torch.utils.data.IterableDataset): for idx in range(iter_start, iter_end): item_dict = self.dataset[idx] res = { - k: np.array(item_dict[k]) + k: torch.tensor(item_dict[k]) for k in self.columns if k in self.retained_columns } for preprocessor in self.preprocessor_list: res.update({ - k: np.array(v) + k: torch.tensor(v) for k, v in preprocessor(item_dict).items() if k in self.retained_columns })
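The last hunk above swaps np.array(...) for torch.tensor(...) where MsIterableDataset gathers preprocessor outputs. The following is a small self-contained illustration of the difference, using a made-up stand-in preprocessor rather than a real modelscope one: values that are already torch.Tensor stay tensors instead of being silently turned into numpy arrays before they reach the torch DataLoader.

# Illustration only; fake_preprocessor is a stand-in, not a modelscope class.
import numpy as np
import torch


def fake_preprocessor(item):
    # Pretend this preprocessor already returns torch tensors, as many
    # modelscope preprocessors do.
    return {'input_ids': torch.tensor([101, 2054, 102]), 'label': item['label']}


out = fake_preprocessor({'label': 1})

# Old behaviour: every value is forced through np.array, so tensors come
# back out as numpy arrays.
old_style = {k: np.array(v) for k, v in out.items()}
# New behaviour: torch.tensor copies the value but keeps it a torch.Tensor
# (PyTorch may warn that t.clone().detach() is preferred for tensor inputs).
new_style = {k: torch.tensor(v) for k, v in out.items()}

print(type(old_style['input_ids']))  # <class 'numpy.ndarray'>
print(type(new_style['input_ids']))  # <class 'torch.Tensor'>

For preprocessor outputs that live on GPU or require grad, the numpy conversion is also where an exception would be raised, which appears to be the failure the commit title describes.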