From 9b6e709d3e4346b1bd72e3bb11afc30efad08bad Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sun, 9 Oct 2022 17:34:21 +0800 Subject: [PATCH 001/129] [to #44902099] update license Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10335285 --- .../multi_modal/mmr/models/clip_for_mm_video_embedding.py | 2 +- .../models/multi_modal/mmr/models/dynamic_inverted_softmax.py | 2 +- modelscope/models/multi_modal/mmr/models/tokenization_clip.py | 2 +- modelscope/pipelines/audio/asr_inference_pipeline.py | 1 + modelscope/pipelines/audio/kws_kwsbp_pipeline.py | 1 + modelscope/pipelines/cv/crowd_counting_pipeline.py | 1 + modelscope/pipelines/cv/face_image_generation_pipeline.py | 1 + modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py | 1 + modelscope/pipelines/cv/image_color_enhance_pipeline.py | 1 + modelscope/pipelines/cv/image_colorization_pipeline.py | 1 + modelscope/pipelines/cv/image_denoise_pipeline.py | 1 + modelscope/pipelines/cv/image_matting_pipeline.py | 1 + modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py | 1 + modelscope/pipelines/cv/image_super_resolution_pipeline.py | 1 + modelscope/pipelines/cv/image_to_image_generate_pipeline.py | 1 + modelscope/pipelines/cv/image_to_image_translation_pipeline.py | 1 + modelscope/pipelines/cv/ocr_detection_pipeline.py | 1 + modelscope/pipelines/cv/ocr_recognition_pipeline.py | 1 + modelscope/pipelines/cv/product_retrieval_embedding_pipeline.py | 1 + modelscope/pipelines/cv/video_summarization_pipeline.py | 1 + modelscope/pipelines/nlp/feature_extraction_pipeline.py | 1 + modelscope/pipelines/nlp/sequence_classification_pipeline.py | 1 + modelscope/pipelines/nlp/text2text_generation_pipeline.py | 1 + 23 files changed, 23 insertions(+), 3 deletions(-) diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 5e8e2e7a..2a72985f 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,4 +1,4 @@ -# The implementation is adopated from the CLIP4Clip implementation, +# The implementation is adopted from the CLIP4Clip implementation, # made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import random diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py index 253a847c..c2d96275 100644 --- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py +++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py @@ -1,4 +1,4 @@ -# The implementation is adopated from the CLIP4Clip implementation, +# The implementation is adopted from the CLIP4Clip implementation, # made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py index 4e2c9b15..97ee7156 100644 --- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py @@ -1,4 +1,4 @@ -# The implementation is adopated from the CLIP4Clip implementation, +# The implementation is adopted from the CLIP4Clip implementation, # made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import gzip 
diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index 282d1184..4e8b658d 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, List, Sequence, Tuple, Union import yaml diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 866b8d0b..450a12bb 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict, List, Union diff --git a/modelscope/pipelines/cv/crowd_counting_pipeline.py b/modelscope/pipelines/cv/crowd_counting_pipeline.py index 3143825b..93fffdf2 100644 --- a/modelscope/pipelines/cv/crowd_counting_pipeline.py +++ b/modelscope/pipelines/cv/crowd_counting_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math from typing import Any, Dict diff --git a/modelscope/pipelines/cv/face_image_generation_pipeline.py b/modelscope/pipelines/cv/face_image_generation_pipeline.py index 405c9a4b..f00d639e 100644 --- a/modelscope/pipelines/cv/face_image_generation_pipeline.py +++ b/modelscope/pipelines/cv/face_image_generation_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict diff --git a/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py index 5e4cd4c6..21af2f75 100644 --- a/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py +++ b/modelscope/pipelines/cv/hicossl_video_embedding_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py index 40777d60..d21d879c 100644 --- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py +++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/cv/image_colorization_pipeline.py b/modelscope/pipelines/cv/image_colorization_pipeline.py index 0fea729d..cd385024 100644 --- a/modelscope/pipelines/cv/image_colorization_pipeline.py +++ b/modelscope/pipelines/cv/image_colorization_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import cv2 diff --git a/modelscope/pipelines/cv/image_denoise_pipeline.py b/modelscope/pipelines/cv/image_denoise_pipeline.py index 64aa3bc9..a11abf36 100644 --- a/modelscope/pipelines/cv/image_denoise_pipeline.py +++ b/modelscope/pipelines/cv/image_denoise_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union import torch diff --git a/modelscope/pipelines/cv/image_matting_pipeline.py b/modelscope/pipelines/cv/image_matting_pipeline.py index d7b7fc3c..fb5d8f8b 100644 --- a/modelscope/pipelines/cv/image_matting_pipeline.py +++ b/modelscope/pipelines/cv/image_matting_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py index 87e692e8..3eec6526 100644 --- a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py +++ b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math from typing import Any, Dict diff --git a/modelscope/pipelines/cv/image_super_resolution_pipeline.py b/modelscope/pipelines/cv/image_super_resolution_pipeline.py index 657acc41..ca8f3209 100644 --- a/modelscope/pipelines/cv/image_super_resolution_pipeline.py +++ b/modelscope/pipelines/cv/image_super_resolution_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import cv2 diff --git a/modelscope/pipelines/cv/image_to_image_generate_pipeline.py b/modelscope/pipelines/cv/image_to_image_generate_pipeline.py index 2a3881e7..4f0121dd 100644 --- a/modelscope/pipelines/cv/image_to_image_generate_pipeline.py +++ b/modelscope/pipelines/cv/image_to_image_generate_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/image_to_image_translation_pipeline.py b/modelscope/pipelines/cv/image_to_image_translation_pipeline.py index 78901c9b..e5f853ca 100644 --- a/modelscope/pipelines/cv/image_to_image_translation_pipeline.py +++ b/modelscope/pipelines/cv/image_to_image_translation_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import io import os.path as osp import sys diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 07231efa..292ec2c5 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/ocr_recognition_pipeline.py b/modelscope/pipelines/cv/ocr_recognition_pipeline.py index c20d020c..e81467a1 100644 --- a/modelscope/pipelines/cv/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/cv/ocr_recognition_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/product_retrieval_embedding_pipeline.py b/modelscope/pipelines/cv/product_retrieval_embedding_pipeline.py index 2614983b..0164a998 100644 --- a/modelscope/pipelines/cv/product_retrieval_embedding_pipeline.py +++ b/modelscope/pipelines/cv/product_retrieval_embedding_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py index 001780e1..25ea1e7c 100644 --- a/modelscope/pipelines/cv/video_summarization_pipeline.py +++ b/modelscope/pipelines/cv/video_summarization_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from typing import Any, Dict diff --git a/modelscope/pipelines/nlp/feature_extraction_pipeline.py b/modelscope/pipelines/nlp/feature_extraction_pipeline.py index 3af0c28d..e94e4337 100644 --- a/modelscope/pipelines/nlp/feature_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/feature_extraction_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 8d0e1dcd..69f6217a 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Union import numpy as np diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py index 9ccd00f4..21aacf54 100644 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union import torch From 65bea053afe17373c1498cc466b59e9514a5febc Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sun, 9 Oct 2022 18:59:07 +0800 Subject: [PATCH 002/129] [to #44902165] feat: Add input signature for pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10233235 * add input definition for pipeline * support multiple input format for single pipeline * change param for body_3d_keypoints --- modelscope/pipeline_inputs.py | 236 ++++++++++++++++++ modelscope/pipelines/base.py | 39 ++- .../cv/body_3d_keypoints_pipeline.py | 21 +- tests/pipelines/test_body_3d_keypoints.py | 12 +- tests/pipelines/test_mplug_tasks.py | 4 +- tests/pipelines/test_ofa_tasks.py | 18 +- 6 files changed, 296 insertions(+), 34 deletions(-) create mode 100644 modelscope/pipeline_inputs.py diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py new file mode 100644 index 00000000..de9814a7 --- /dev/null +++ b/modelscope/pipeline_inputs.py @@ -0,0 +1,236 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import cv2 +import numpy as np +from PIL import Image + +from modelscope.models.base.base_head import Input +from modelscope.utils.constant import Tasks + + +class InputKeys(object): + IMAGE = 'image' + TEXT = 'text' + VIDEO = 'video' + + +class InputType(object): + IMAGE = 'image' + TEXT = 'text' + AUDIO = 'audio' + VIDEO = 'video' + BOX = 'box' + DICT = 'dict' + LIST = 'list' + INT = 'int' + + +INPUT_TYPE = { + InputType.IMAGE: (str, np.ndarray, Image.Image), + InputType.TEXT: str, + InputType.AUDIO: (str, np.ndarray), + InputType.VIDEO: (str, np.ndarray, cv2.VideoCapture), + InputType.BOX: (list, np.ndarray), + InputType.DICT: (dict, type(None)), + InputType.LIST: (list, type(None)), + InputType.INT: int, +} + + +def check_input_type(input_type, input): + expected_type = INPUT_TYPE[input_type] + assert isinstance(input, expected_type), \ + f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}' + + +TASK_INPUTS = { + # if task input is single var, value is InputType + # if task input is a tuple, value is tuple of InputType + # if task input is a dict, value is a dict of InputType, where key + # equals the one needed in pipeline input dict + # if task input is a list, value is a set of input format, in which + # each elements corresponds to one input format as described above. + # ============ vision tasks =================== + Tasks.ocr_detection: + InputType.IMAGE, + Tasks.ocr_recognition: + InputType.IMAGE, + Tasks.face_2d_keypoints: + InputType.IMAGE, + Tasks.face_detection: + InputType.IMAGE, + Tasks.facial_expression_recognition: + InputType.IMAGE, + Tasks.face_recognition: + InputType.IMAGE, + Tasks.human_detection: + InputType.IMAGE, + Tasks.face_image_generation: + InputType.INT, + Tasks.image_classification: + InputType.IMAGE, + Tasks.image_object_detection: + InputType.IMAGE, + Tasks.image_segmentation: + InputType.IMAGE, + Tasks.portrait_matting: + InputType.IMAGE, + + # image editing task result for a single image + Tasks.skin_retouching: + InputType.IMAGE, + Tasks.image_super_resolution: + InputType.IMAGE, + Tasks.image_colorization: + InputType.IMAGE, + Tasks.image_color_enhancement: + InputType.IMAGE, + Tasks.image_denoising: + InputType.IMAGE, + Tasks.image_portrait_enhancement: + InputType.IMAGE, + Tasks.crowd_counting: + InputType.IMAGE, + + # image generation task result for a single image + Tasks.image_to_image_generation: + InputType.IMAGE, + Tasks.image_to_image_translation: + InputType.IMAGE, + Tasks.image_style_transfer: + InputType.IMAGE, + Tasks.image_portrait_stylization: + InputType.IMAGE, + Tasks.live_category: + InputType.VIDEO, + Tasks.action_recognition: + InputType.VIDEO, + Tasks.body_2d_keypoints: + InputType.IMAGE, + Tasks.body_3d_keypoints: + InputType.VIDEO, + Tasks.hand_2d_keypoints: + InputType.IMAGE, + Tasks.video_single_object_tracking: (InputType.VIDEO, InputType.BOX), + Tasks.video_category: + InputType.VIDEO, + Tasks.product_retrieval_embedding: + InputType.IMAGE, + Tasks.video_embedding: + InputType.VIDEO, + Tasks.virtual_try_on: (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE), + Tasks.text_driven_segmentation: { + InputKeys.IMAGE: InputType.IMAGE, + InputKeys.TEXT: InputType.TEXT + }, + Tasks.shop_segmentation: + InputType.IMAGE, + Tasks.movie_scene_segmentation: + InputType.VIDEO, + + # ============ nlp tasks =================== + Tasks.text_classification: [ + InputType.TEXT, + (InputType.TEXT, InputType.TEXT), + { + 'text': InputType.TEXT, + 'text2': InputType.TEXT + }, + ], + 
Tasks.sentence_similarity: (InputType.TEXT, InputType.TEXT), + Tasks.nli: (InputType.TEXT, InputType.TEXT), + Tasks.sentiment_classification: + InputType.TEXT, + Tasks.zero_shot_classification: + InputType.TEXT, + Tasks.relation_extraction: + InputType.TEXT, + Tasks.translation: + InputType.TEXT, + Tasks.word_segmentation: + InputType.TEXT, + Tasks.part_of_speech: + InputType.TEXT, + Tasks.named_entity_recognition: + InputType.TEXT, + Tasks.text_error_correction: + InputType.TEXT, + Tasks.sentence_embedding: { + 'source_sentence': InputType.LIST, + 'sentences_to_compare': InputType.LIST, + }, + Tasks.passage_ranking: (InputType.TEXT, InputType.TEXT), + Tasks.text_generation: + InputType.TEXT, + Tasks.fill_mask: + InputType.TEXT, + Tasks.task_oriented_conversation: { + 'user_input': InputType.TEXT, + 'history': InputType.DICT, + }, + Tasks.table_question_answering: { + 'question': InputType.TEXT, + 'history_sql': InputType.DICT, + }, + Tasks.faq_question_answering: { + 'query_set': InputType.LIST, + 'support_set': InputType.LIST, + }, + + # ============ audio tasks =================== + Tasks.auto_speech_recognition: + InputType.AUDIO, + Tasks.speech_signal_process: + InputType.AUDIO, + Tasks.acoustic_echo_cancellation: { + 'nearend_mic': InputType.AUDIO, + 'farend_speech': InputType.AUDIO + }, + Tasks.acoustic_noise_suppression: + InputType.AUDIO, + Tasks.text_to_speech: + InputType.TEXT, + Tasks.keyword_spotting: + InputType.AUDIO, + + # ============ multi-modal tasks =================== + Tasks.image_captioning: + InputType.IMAGE, + Tasks.visual_grounding: { + 'image': InputType.IMAGE, + 'text': InputType.TEXT + }, + Tasks.text_to_image_synthesis: { + 'text': InputType.TEXT, + }, + Tasks.multi_modal_embedding: { + 'img': InputType.IMAGE, + 'text': InputType.TEXT + }, + Tasks.generative_multi_modal_embedding: { + 'image': InputType.IMAGE, + 'text': InputType.TEXT + }, + Tasks.multi_modal_similarity: { + 'img': InputType.IMAGE, + 'text': InputType.TEXT + }, + Tasks.visual_question_answering: { + 'image': InputType.IMAGE, + 'text': InputType.TEXT + }, + Tasks.visual_entailment: { + 'image': InputType.IMAGE, + 'text': InputType.TEXT, + 'text2': InputType.TEXT, + }, + Tasks.action_detection: + InputType.VIDEO, + Tasks.image_reid_person: + InputType.IMAGE, + Tasks.video_inpainting: { + 'video_input_path': InputType.TEXT, + 'video_output_path': InputType.TEXT, + 'mask_path': InputType.TEXT, + } +} diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index c5db2b57..5732a9d7 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -13,6 +13,7 @@ import numpy as np from modelscope.models.base import Model from modelscope.msdatasets import MsDataset from modelscope.outputs import TASK_OUTPUTS +from modelscope.pipeline_inputs import TASK_INPUTS, check_input_type from modelscope.preprocessors import Preprocessor from modelscope.utils.config import Config from modelscope.utils.constant import Frameworks, ModelFile @@ -210,7 +211,7 @@ class Pipeline(ABC): preprocess_params = kwargs.get('preprocess_params', {}) forward_params = kwargs.get('forward_params', {}) postprocess_params = kwargs.get('postprocess_params', {}) - + self._check_input(input) out = self.preprocess(input, **preprocess_params) with device_placement(self.framework, self.device_name): if self.framework == Frameworks.torch: @@ -225,6 +226,42 @@ class Pipeline(ABC): self._check_output(out) return out + def _check_input(self, input): + task_name = self.group_key + if task_name in TASK_INPUTS: + 
input_type = TASK_INPUTS[task_name] + + # if multiple input formats are defined, we first + # found the one that match input data and check + if isinstance(input_type, list): + matched_type = None + for t in input_type: + if type(t) == type(input): + matched_type = t + break + if matched_type is None: + err_msg = 'input data format for current pipeline should be one of following: \n' + for t in input_type: + err_msg += f'{t}\n' + raise ValueError(err_msg) + else: + input_type = matched_type + + if isinstance(input_type, str): + check_input_type(input_type, input) + elif isinstance(input_type, tuple): + for t, input_ele in zip(input_type, input): + check_input_type(t, input_ele) + elif isinstance(input_type, dict): + for k in input_type.keys(): + # allow single input for multi-modal models + if k in input: + check_input_type(input_type[k], input[k]) + else: + raise ValueError(f'invalid input_type definition {input_type}') + else: + logger.warning(f'task {task_name} input definition is missing') + def _check_output(self, input): # this attribute is dynamically attached by registry # when cls is registered in registry using task name diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index 474c0e54..b0faa1e0 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -132,12 +132,7 @@ class Body3DKeypointsPipeline(Pipeline): device='gpu' if torch.cuda.is_available() else 'cpu') def preprocess(self, input: Input) -> Dict[str, Any]: - video_url = input.get('input_video') - self.output_video_path = input.get('output_video_path') - if self.output_video_path is None: - self.output_video_path = tempfile.NamedTemporaryFile( - suffix='.mp4').name - + video_url = input video_frames = self.read_video_frames(video_url) if 0 == len(video_frames): res = {'success': False, 'msg': 'get video frame failed.'} @@ -194,9 +189,13 @@ class Body3DKeypointsPipeline(Pipeline): pred_3d_pose = poses.data.cpu().numpy()[ 0] # [frame_num, joint_num, joint_dim] + output_video_path = kwargs.get('output_video', None) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile( + suffix='.mp4').name if 'render' in self.keypoint_model_3d.cfg.keys(): - self.render_prediction(pred_3d_pose) - res[OutputKeys.OUTPUT_VIDEO] = self.output_video_path + self.render_prediction(pred_3d_pose, output_video_path) + res[OutputKeys.OUTPUT_VIDEO] = output_video_path res[OutputKeys.POSES] = pred_3d_pose res[OutputKeys.TIMESTAMPS] = self.timestamps @@ -252,12 +251,12 @@ class Body3DKeypointsPipeline(Pipeline): cap.release() return frames - def render_prediction(self, pose3d_cam_rr): + def render_prediction(self, pose3d_cam_rr, output_video_path): """render predict result 3d poses. 
Args: pose3d_cam_rr (nd.array): [frame_num, joint_num, joint_dim], 3d pose joints - + output_video_path (str): output path for video Returns: """ frame_num = pose3d_cam_rr.shape[0] @@ -359,4 +358,4 @@ class Body3DKeypointsPipeline(Pipeline): # save mp4 Writer = writers['ffmpeg'] writer = Writer(fps=self.fps, metadata={}, bitrate=4096) - ani.save(self.output_video_path, writer=writer) + ani.save(output_video_path, writer=writer) diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index bde04f8e..6f27f12d 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -20,7 +20,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.body_3d_keypoints def pipeline_inference(self, pipeline: Pipeline, pipeline_input): - output = pipeline(pipeline_input) + output = pipeline(pipeline_input, output_video='./result.mp4') poses = np.array(output[OutputKeys.POSES]) print(f'result 3d points shape {poses.shape}') @@ -28,10 +28,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_modelhub_with_video_file(self): body_3d_keypoints = pipeline( Tasks.body_3d_keypoints, model=self.model_id) - pipeline_input = { - 'input_video': self.test_video, - 'output_video_path': './result.mp4' - } + pipeline_input = self.test_video self.pipeline_inference( body_3d_keypoints, pipeline_input=pipeline_input) @@ -42,10 +39,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): if not cap.isOpened(): raise Exception('modelscope error: %s cannot be decoded by OpenCV.' % (self.test_video)) - pipeline_input = { - 'input_video': cap, - 'output_video_path': './result.mp4' - } + pipeline_input = self.test_video self.pipeline_inference( body_3d_keypoints, pipeline_input=pipeline_input) diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py index a3ace62d..11c9798f 100644 --- a/tests/pipelines/test_mplug_tasks.py +++ b/tests/pipelines/test_mplug_tasks.py @@ -26,7 +26,7 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): model=model, ) image = Image.open('data/test/images/image_mplug_vqa.jpg') - result = pipeline_caption({'image': image}) + result = pipeline_caption(image) print(result[OutputKeys.CAPTION]) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -35,7 +35,7 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_captioning, model='damo/mplug_image-captioning_coco_base_en') image = Image.open('data/test/images/image_mplug_vqa.jpg') - result = pipeline_caption({'image': image}) + result = pipeline_caption(image) print(result[OutputKeys.CAPTION]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index e6638dfa..f8366508 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -34,7 +34,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): model=model, ) image = 'data/test/images/image_captioning.png' - result = img_captioning({'image': image}) + result = img_captioning(image) print(result[OutputKeys.CAPTION]) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -42,8 +42,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): img_captioning = pipeline( Tasks.image_captioning, model='damo/ofa_image-caption_coco_large_en') - result = img_captioning( - {'image': 
'data/test/images/image_captioning.png'}) + result = img_captioning('data/test/images/image_captioning.png') print(result[OutputKeys.CAPTION]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -52,8 +51,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): 'damo/ofa_image-classification_imagenet_large_en') ofa_pipe = pipeline(Tasks.image_classification, model=model) image = 'data/test/images/image_classification.png' - input = {'image': image} - result = ofa_pipe(input) + result = ofa_pipe(image) print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -62,8 +60,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_classification, model='damo/ofa_image-classification_imagenet_large_en') image = 'data/test/images/image_classification.png' - input = {'image': image} - result = ofa_pipe(input) + result = ofa_pipe(image) print(result) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -99,8 +96,8 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): ofa_pipe = pipeline(Tasks.text_classification, model=model) text = 'One of our number will carry out your instructions minutely.' text2 = 'A member of my team will execute your orders with immense precision.' - input = {'text': text, 'text2': text2} - result = ofa_pipe(input) + result = ofa_pipe((text, text2)) + result = ofa_pipe({'text': text, 'text2': text2}) print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -110,8 +107,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): model='damo/ofa_text-classification_mnli_large_en') text = 'One of our number will carry out your instructions minutely.' text2 = 'A member of my team will execute your orders with immense precision.' 
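The test updates in this patch show the practical effect of the new input signatures: pipelines now accept their natural input directly (an image path, a text pair as a tuple or a dict) instead of a hand-built dict, and mismatched inputs are rejected early by `Pipeline._check_input`. A condensed sketch of the resulting calling conventions, reusing the model IDs and test assets that appear in these tests:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Image tasks now take the image (path, PIL.Image or np.ndarray) directly.
img_captioning = pipeline(
    Tasks.image_captioning, model='damo/ofa_image-caption_coco_large_en')
result = img_captioning('data/test/images/image_captioning.png')
print(result[OutputKeys.CAPTION])

# Text classification declares several accepted formats in TASK_INPUTS,
# so either a (text, text2) tuple or a {'text': ..., 'text2': ...} dict works.
ofa_pipe = pipeline(
    Tasks.text_classification,
    model='damo/ofa_text-classification_mnli_large_en')
print(ofa_pipe(('One of our number will carry out your instructions minutely.',
                'A member of my team will execute your orders with immense precision.')))

# Inputs matching none of the declared formats raise a ValueError or an
# assertion in check_input_type instead of failing deep inside preprocess().
```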
- input = {'text': text, 'text2': text2} - result = ofa_pipe(input) + result = ofa_pipe((text, text2)) print(result) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') From eb6d81a53e91870ae03bb0bd4d63f996b93d51cf Mon Sep 17 00:00:00 2001 From: "baiguan.yt" Date: Mon, 10 Oct 2022 09:22:41 +0800 Subject: [PATCH 003/129] [to #42322933]add license header Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10336963 --- modelscope/metrics/image_portrait_enhancement_metric.py | 2 ++ modelscope/models/cv/face_generation/op/conv2d_gradfix.py | 2 ++ modelscope/models/cv/face_generation/op/fused_act.py | 2 ++ modelscope/models/cv/face_generation/op/upfirdn2d.py | 2 ++ modelscope/models/cv/face_generation/stylegan2.py | 2 ++ modelscope/models/cv/image_colorization/unet.py | 2 ++ modelscope/models/cv/image_colorization/utils.py | 2 ++ modelscope/models/cv/image_portrait_enhancement/align_faces.py | 2 ++ modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py | 1 + .../models/cv/image_portrait_enhancement/eqface/model_resnet.py | 2 ++ modelscope/models/cv/image_portrait_enhancement/gpen.py | 2 ++ .../cv/image_portrait_enhancement/image_portrait_enhancement.py | 1 + .../models/cv/image_portrait_enhancement/losses/helpers.py | 2 ++ .../models/cv/image_portrait_enhancement/losses/losses.py | 2 ++ .../models/cv/image_portrait_enhancement/losses/model_irse.py | 2 ++ .../cv/image_portrait_enhancement/retinaface/detection.py | 2 ++ .../cv/image_portrait_enhancement/retinaface/models/net.py | 2 ++ .../image_portrait_enhancement/retinaface/models/retinaface.py | 2 ++ modelscope/models/cv/super_resolution/arch_util.py | 2 ++ modelscope/models/cv/super_resolution/rrdbnet_arch.py | 2 ++ 20 files changed, 38 insertions(+) diff --git a/modelscope/metrics/image_portrait_enhancement_metric.py b/modelscope/metrics/image_portrait_enhancement_metric.py index b8412b9e..5a81e956 100644 --- a/modelscope/metrics/image_portrait_enhancement_metric.py +++ b/modelscope/metrics/image_portrait_enhancement_metric.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from BasicSR, publicly available at +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py from typing import Dict import numpy as np diff --git a/modelscope/models/cv/face_generation/op/conv2d_gradfix.py b/modelscope/models/cv/face_generation/op/conv2d_gradfix.py index 661f4fc7..a3aba91f 100755 --- a/modelscope/models/cv/face_generation/op/conv2d_gradfix.py +++ b/modelscope/models/cv/face_generation/op/conv2d_gradfix.py @@ -1,3 +1,5 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py import contextlib import warnings diff --git a/modelscope/models/cv/face_generation/op/fused_act.py b/modelscope/models/cv/face_generation/op/fused_act.py index d6e0c10f..a24f5972 100755 --- a/modelscope/models/cv/face_generation/op/fused_act.py +++ b/modelscope/models/cv/face_generation/op/fused_act.py @@ -1,3 +1,5 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# t https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py import os import torch diff --git a/modelscope/models/cv/face_generation/op/upfirdn2d.py b/modelscope/models/cv/face_generation/op/upfirdn2d.py index 5a44421d..95c987af 100755 --- a/modelscope/models/cv/face_generation/op/upfirdn2d.py +++ 
b/modelscope/models/cv/face_generation/op/upfirdn2d.py @@ -1,3 +1,5 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py import os from collections import abc diff --git a/modelscope/models/cv/face_generation/stylegan2.py b/modelscope/models/cv/face_generation/stylegan2.py index ff9c83ee..4c650f54 100755 --- a/modelscope/models/cv/face_generation/stylegan2.py +++ b/modelscope/models/cv/face_generation/stylegan2.py @@ -1,3 +1,5 @@ +# The implementation is adopted from stylegan2-pytorch, +# made public available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py import functools import math import operator diff --git a/modelscope/models/cv/image_colorization/unet.py b/modelscope/models/cv/image_colorization/unet.py index 8123651e..19f6ab62 100644 --- a/modelscope/models/cv/image_colorization/unet.py +++ b/modelscope/models/cv/image_colorization/unet.py @@ -1,3 +1,5 @@ +# The implementation here is modified based on DeOldify, originally MIT License +# and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_colorization/utils.py b/modelscope/models/cv/image_colorization/utils.py index 03473f90..b8968aa0 100644 --- a/modelscope/models/cv/image_colorization/utils.py +++ b/modelscope/models/cv/image_colorization/utils.py @@ -1,3 +1,5 @@ +# The implementation here is modified based on DeOldify, originally MIT License and +# publicly available at https://github.com/jantic/DeOldify/blob/master/fastai/callbacks/hooks.py import functools from enum import Enum diff --git a/modelscope/models/cv/image_portrait_enhancement/align_faces.py b/modelscope/models/cv/image_portrait_enhancement/align_faces.py index 776b06d8..e6852f8c 100755 --- a/modelscope/models/cv/image_portrait_enhancement/align_faces.py +++ b/modelscope/models/cv/image_portrait_enhancement/align_faces.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from Face-Alignment, +# publicly available at https://github.com/foamliu/Face-Alignment/blob/master/align_faces.py import cv2 import numpy as np from skimage import transform as trans diff --git a/modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py b/modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py index fe4081a4..51f2206e 100755 --- a/modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py +++ b/modelscope/models/cv/image_portrait_enhancement/eqface/fqa.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os import cv2 diff --git a/modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py b/modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py index ea3c4f2a..e0e8e9d5 100644 --- a/modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py +++ b/modelscope/models/cv/image_portrait_enhancement/eqface/model_resnet.py @@ -1,3 +1,5 @@ +# The implementation is adopted from FaceQuality, made publicly available under the MIT License +# at https://github.com/deepcam-cn/FaceQuality/blob/master/models/model_resnet.py import torch from torch import nn diff --git a/modelscope/models/cv/image_portrait_enhancement/gpen.py b/modelscope/models/cv/image_portrait_enhancement/gpen.py index 2e21dbc0..86009a41 100755 --- a/modelscope/models/cv/image_portrait_enhancement/gpen.py +++ b/modelscope/models/cv/image_portrait_enhancement/gpen.py @@ -1,3 +1,5 @@ +# The GPEN implementation is also open-sourced by the authors, +# and available at https://github.com/yangxy/GPEN/blob/main/face_model/gpen_model.py import functools import itertools import math diff --git a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py index 3250d393..3650ac7b 100644 --- a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py +++ b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import math import os.path as osp from copy import deepcopy diff --git a/modelscope/models/cv/image_portrait_enhancement/losses/helpers.py b/modelscope/models/cv/image_portrait_enhancement/losses/helpers.py index 35ca202f..86f6f227 100644 --- a/modelscope/models/cv/image_portrait_enhancement/losses/helpers.py +++ b/modelscope/models/cv/image_portrait_enhancement/losses/helpers.py @@ -1,3 +1,5 @@ +# The implementation is adopted from InsightFace_Pytorch, +# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py from collections import namedtuple import torch diff --git a/modelscope/models/cv/image_portrait_enhancement/losses/losses.py b/modelscope/models/cv/image_portrait_enhancement/losses/losses.py index 8934eee7..0f5198c3 100644 --- a/modelscope/models/cv/image_portrait_enhancement/losses/losses.py +++ b/modelscope/models/cv/image_portrait_enhancement/losses/losses.py @@ -1,3 +1,5 @@ +# The GPEN implementation is also open-sourced by the authors, +# and available at https://github.com/yangxy/GPEN/tree/main/training/loss/id_loss.py import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py b/modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py index 3b87d7fd..00dc7c52 100644 --- a/modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py +++ b/modelscope/models/cv/image_portrait_enhancement/losses/model_irse.py @@ -1,3 +1,5 @@ +# The implementation is adopted from InsightFace_Pytorch, +# made publicly available under the MIT License at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, Module, PReLU, Sequential) diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py index c294438a..7ad780a8 100755 --- 
a/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/detection.py @@ -1,3 +1,5 @@ +# The GPEN implementation is also open-sourced by the authors, +# and available at https://github.com/yangxy/GPEN/blob/main/face_detect/retinaface_detection.py import os import cv2 diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py index 0546e0bb..24451e96 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py @@ -1,3 +1,5 @@ +# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py import time import torch diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py index af1d706d..64d95971 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py @@ -1,3 +1,5 @@ +# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py from collections import OrderedDict import torch diff --git a/modelscope/models/cv/super_resolution/arch_util.py b/modelscope/models/cv/super_resolution/arch_util.py index 4b87c877..99711a11 100644 --- a/modelscope/models/cv/super_resolution/arch_util.py +++ b/modelscope/models/cv/super_resolution/arch_util.py @@ -1,3 +1,5 @@ +# The implementation is adopted from BasicSR, made public available under the Apache 2.0 License +# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/arch_util.py import collections.abc import math import warnings diff --git a/modelscope/models/cv/super_resolution/rrdbnet_arch.py b/modelscope/models/cv/super_resolution/rrdbnet_arch.py index 44947de1..8c84f796 100644 --- a/modelscope/models/cv/super_resolution/rrdbnet_arch.py +++ b/modelscope/models/cv/super_resolution/rrdbnet_arch.py @@ -1,3 +1,5 @@ +# The implementation is adopted from BasicSR, made public available under the Apache 2.0 License +# at https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/archs/rrdbnet_arch.py import torch from torch import nn as nn from torch.nn import functional as F From a05f37f826bbe3345506b46611e8dcd9d4059c4f Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Mon, 10 Oct 2022 09:23:18 +0800 Subject: [PATCH 004/129] ofa clip add license Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10340433 --- modelscope/models/multi_modal/clip/__init__.py | 2 ++ modelscope/models/multi_modal/clip/model.py | 15 +++++++++++++++ modelscope/models/multi_modal/ofa/__init__.py | 2 ++ modelscope/models/multi_modal/ofa/resnet.py | 14 ++++++++++++++ .../models/multi_modal/ofa/utils/__init__.py | 1 + .../ofa_for_text_to_image_synthesis_model.py | 2 ++ modelscope/preprocessors/ofa/utils/collate.py | 2 ++ modelscope/preprocessors/ofa/utils/random_help.py | 2 ++ modelscope/trainers/multi_modal/clip/__init__.py | 2 ++ .../trainers/multi_modal/clip/clip_trainer.py | 2 ++ .../multi_modal/clip/clip_trainer_utils.py | 2 ++ 11 files changed, 46 insertions(+) diff --git 
a/modelscope/models/multi_modal/clip/__init__.py b/modelscope/models/multi_modal/clip/__init__.py index 3fd492b9..e2e925ce 100644 --- a/modelscope/models/multi_modal/clip/__init__.py +++ b/modelscope/models/multi_modal/clip/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .model import CLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py index 2fb0d7e3..92d9e11a 100644 --- a/modelscope/models/multi_modal/clip/model.py +++ b/modelscope/models/multi_modal/clip/model.py @@ -1,3 +1,18 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os from collections import OrderedDict from typing import Any, Dict, Iterable, List, Tuple, Union diff --git a/modelscope/models/multi_modal/ofa/__init__.py b/modelscope/models/multi_modal/ofa/__init__.py index 16de7fff..3e8e59f4 100644 --- a/modelscope/models/multi_modal/ofa/__init__.py +++ b/modelscope/models/multi_modal/ofa/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .modeling_ofa import OFADecoder, OFAEncoder, OFAModel, OFAPreTrainedModel from .tokenization_ofa import OFATokenizer, OFATokenizerZH from .tokenization_ofa_fast import OFATokenizerFast, OFATokenizerZHFast diff --git a/modelscope/models/multi_modal/ofa/resnet.py b/modelscope/models/multi_modal/ofa/resnet.py index de6444ab..aad0f002 100644 --- a/modelscope/models/multi_modal/ofa/resnet.py +++ b/modelscope/models/multi_modal/ofa/resnet.py @@ -1,3 +1,17 @@ +# Copyright 2022 OFA-Sys Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.nn as nn diff --git a/modelscope/models/multi_modal/ofa/utils/__init__.py b/modelscope/models/multi_modal/ofa/utils/__init__.py index e69de29b..b937315b 100644 --- a/modelscope/models/multi_modal/ofa/utils/__init__.py +++ b/modelscope/models/multi_modal/ofa/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py index b942e3fa..8110a0f7 100644 --- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py +++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index a473335b..4bfa8eca 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import numpy as np import torch diff --git a/modelscope/preprocessors/ofa/utils/random_help.py b/modelscope/preprocessors/ofa/utils/random_help.py index 77f4df3f..e0dca54e 100644 --- a/modelscope/preprocessors/ofa/utils/random_help.py +++ b/modelscope/preprocessors/ofa/utils/random_help.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch try: diff --git a/modelscope/trainers/multi_modal/clip/__init__.py b/modelscope/trainers/multi_modal/clip/__init__.py index 87f1040c..61a6664b 100644 --- a/modelscope/trainers/multi_modal/clip/__init__.py +++ b/modelscope/trainers/multi_modal/clip/__init__.py @@ -1 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from .clip_trainer import CLIPTrainer diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer.py b/modelscope/trainers/multi_modal/clip/clip_trainer.py index cccf4296..cbe83417 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Dict, Optional diff --git a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py index 1391a4fd..4e150fe7 100644 --- a/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py +++ b/modelscope/trainers/multi_modal/clip/clip_trainer_utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os import random From 38a399cf38e0a1eab823cdb9f5ea3de108707b1c Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Mon, 10 Oct 2022 09:24:36 +0800 Subject: [PATCH 005/129] [to #42322933] format outputs for movie scene segmentation demo Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10328357 --- .../cv/movie_scene_segmentation/model.py | 21 ++++++++------- .../movie_scene_segmentation/utils/save_op.py | 12 +++++++-- modelscope/outputs.py | 27 ++++++++++++++----- .../cv/movie_scene_segmentation_pipeline.py | 9 ++++--- 4 files changed, 48 insertions(+), 21 deletions(-) diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py index 1232d427..8117961a 100644 --- a/modelscope/models/cv/movie_scene_segmentation/model.py +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -67,7 +67,6 @@ class MovieSceneSegmentationModel(TorchModel): mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) - self.infer_result = {'vid': [], 'sid': [], 'pred': []} sampling_method = self.cfg.dataset.sampling_method.name self.neighbor_size = self.cfg.dataset.sampling_method.params[ sampling_method].neighbor_size @@ -104,6 +103,8 @@ class MovieSceneSegmentationModel(TorchModel): shot_num = len(sids) cnt = shot_num // bs + 1 + infer_sid, infer_pred = [], [] + infer_result = {} for i in range(cnt): start = i * bs end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num @@ -112,13 +113,14 @@ class MovieSceneSegmentationModel(TorchModel): input_ = torch.stack(input_) outputs = self.shared_step(input_) # shape [b,2] prob = F.softmax(outputs, dim=1) - self.infer_result['sid'].extend(sid_.cpu().detach().numpy()) - self.infer_result['pred'].extend(prob[:, 1].cpu().detach().numpy()) - self.infer_result['pred'] = np.stack(self.infer_result['pred']) + infer_sid.extend(sid_.cpu().detach().numpy()) + infer_pred.extend(prob[:, 1].cpu().detach().numpy()) + infer_result.update({'pred': np.stack(infer_pred)}) + infer_result.update({'sid': infer_sid}) - assert len(self.infer_result['sid']) == len(sids) - assert len(self.infer_result['pred']) == len(inputs) - return self.infer_result + assert len(infer_result['sid']) == len(sids) + assert len(infer_result['pred']) == len(inputs) + return infer_result def shared_step(self, inputs): with torch.no_grad(): @@ -162,11 +164,12 @@ class MovieSceneSegmentationModel(TorchModel): thres = self.cfg.pipeline.save_threshold anno_dict = get_pred_boundary(pred_dict, thres) - scene_dict_lst, scene_list = pred2scene(self.shot2keyf, anno_dict) + scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( + self.shot2keyf, anno_dict) if self.cfg.pipeline.save_split_scene: re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) print(f'Split scene video saved to {re_dir}') - return len(scene_list), scene_dict_lst + return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst def preprocess(self, inputs): logger.info('Begin shot detect......') diff --git a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py index b350ff13..3339e1a3 100644 --- a/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py +++ b/modelscope/models/cv/movie_scene_segmentation/utils/save_op.py @@ -22,15 +22,23 @@ def pred2scene(shot2keyf, anno_dict): scene_list, pair_list = get_demo_scene_list(shot2keyf, anno_dict) scene_dict_lst = [] + shot_num = len(shot2keyf) + shot_dict_lst = [] + for item in shot2keyf: + tmp = item.split(' 
') + shot_dict_lst.append({ + 'frame': [tmp[0], tmp[1]], + 'timestamps': [tmp[-2], tmp[-1]] + }) assert len(scene_list) == len(pair_list) for scene_ind, scene_item in enumerate(scene_list): scene_dict_lst.append({ 'shot': pair_list[scene_ind], 'frame': scene_item[0], - 'timestamp': scene_item[1] + 'timestamps': scene_item[1] }) - return scene_dict_lst, scene_list + return scene_dict_lst, scene_list, shot_num, shot_dict_lst def scene2video(source_movie_fn, scene_list, thres): diff --git a/modelscope/outputs.py b/modelscope/outputs.py index d8d2458a..717ff4dd 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -38,8 +38,10 @@ class OutputKeys(object): KWS_LIST = 'kws_list' HISTORY = 'history' TIMESTAMPS = 'timestamps' - SPLIT_VIDEO_NUM = 'split_video_num' - SPLIT_META_LIST = 'split_meta_list' + SHOT_NUM = 'shot_num' + SCENE_NUM = 'scene_num' + SCENE_META_LIST = 'scene_meta_list' + SHOT_META_LIST = 'shot_meta_list' TASK_OUTPUTS = { @@ -309,19 +311,30 @@ TASK_OUTPUTS = { Tasks.shop_segmentation: [OutputKeys.MASKS], # movide scene segmentation result for a single video # { - # "split_video_num":3, - # "split_meta_list": + # "shot_num":15, + # "shot_meta_list": + # [ + # { + # "frame": [start_frame, end_frame], + # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] + # + # } + # ] + # "scene_num":3, + # "scene_meta_list": # [ # { # "shot": [0,1,2], # "frame": [start_frame, end_frame], - # "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] + # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245'] # } # ] # # } - Tasks.movie_scene_segmentation: - [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST], + Tasks.movie_scene_segmentation: [ + OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM, + OutputKeys.SCENE_META_LIST + ], # ============ nlp tasks =================== diff --git a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py index 6704e4c0..3fffc546 100644 --- a/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/movie_scene_segmentation_pipeline.py @@ -60,9 +60,12 @@ class MovieSceneSegmentationPipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: data = {'input_video_pth': self.input_video_pth, 'feat': inputs} - video_num, meta_lst = self.model.postprocess(data) + scene_num, scene_meta_lst, shot_num, shot_meta_lst = self.model.postprocess( + data) result = { - OutputKeys.SPLIT_VIDEO_NUM: video_num, - OutputKeys.SPLIT_META_LIST: meta_lst + OutputKeys.SHOT_NUM: shot_num, + OutputKeys.SHOT_META_LIST: shot_meta_lst, + OutputKeys.SCENE_NUM: scene_num, + OutputKeys.SCENE_META_LIST: scene_meta_lst } return result From 42896de7098a22c503e84a4eb7e377a1450ec29c Mon Sep 17 00:00:00 2001 From: "moyu.wk" Date: Mon, 10 Oct 2022 11:36:39 +0800 Subject: [PATCH 006/129] [to #42322933]add pipelines for APE\QE\Domain Classifications Add pipelines for the following new models: - [Multilingual Quality Estimation](https://modelscope.cn/models/damo/nlp_translation_quality_estimation_multilingual/summary) - [Automatic Post-Editing (En-De)](https://modelscope.cn/models/damo/nlp_automatic_post_editing_for_translation_en2de/summary) - [Domain classification (Zh)](https://modelscope.cn/models/damo/nlp_domain_classification_chinese/summary) - [Style classification (Zh)](https://modelscope.cn/models/damo/nlp_style_classification_chinese/summary) - [Style 
classification (En)](https://modelscope.cn/models/damo/nlp_style_classification_english/summary) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10315370 --- modelscope/metainfo.py | 3 + modelscope/pipelines/nlp/__init__.py | 14 +- .../nlp/automatic_post_editing_pipeline.py | 158 ++++++++++++++++++ ...sttext_sequence_classification_pipeline.py | 69 ++++++++ ...translation_quality_estimation_pipeline.py | 72 ++++++++ modelscope/utils/constant.py | 1 + requirements/nlp.txt | 3 + .../pipelines/test_automatic_post_editing.py | 30 ++++ tests/pipelines/test_domain_classification.py | 45 +++++ .../test_translation_quality_estimation.py | 32 ++++ 10 files changed, 424 insertions(+), 3 deletions(-) create mode 100644 modelscope/pipelines/nlp/automatic_post_editing_pipeline.py create mode 100644 modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py create mode 100644 modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py create mode 100644 tests/pipelines/test_automatic_post_editing.py create mode 100644 tests/pipelines/test_domain_classification.py create mode 100644 tests/pipelines/test_translation_quality_estimation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 33273502..28804ce6 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -189,6 +189,9 @@ class Pipelines(object): product_segmentation = 'product-segmentation' # nlp tasks + automatic_post_editing = 'automatic-post-editing' + translation_quality_estimation = 'translation-quality-estimation' + domain_classification = 'domain-classification' sentence_similarity = 'sentence-similarity' word_segmentation = 'word-segmentation' part_of_speech = 'part-of-speech' diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 5267b5b2..be854593 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -4,12 +4,13 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: + from .automatic_post_editing_pipeline import AutomaticPostEditingPipeline from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline - from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline from .document_segmentation_pipeline import DocumentSegmentationPipeline + from .fasttext_sequence_classification_pipeline import FasttextSequenceClassificationPipeline from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline @@ -20,6 +21,8 @@ if TYPE_CHECKING: from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .summarization_pipeline import SummarizationPipeline + from .table_question_answering_pipeline import TableQuestionAnsweringPipeline + from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline @@ -31,14 +34,15 @@ if TYPE_CHECKING: else: _import_structure = { + 
'automatic_post_editing_pipeline': ['AutomaticPostEditingPipeline'], 'conversational_text_to_sql_pipeline': ['ConversationalTextToSqlPipeline'], - 'table_question_answering_pipeline': - ['TableQuestionAnsweringPipeline'], 'dialog_intent_prediction_pipeline': ['DialogIntentPredictionPipeline'], 'dialog_modeling_pipeline': ['DialogModelingPipeline'], 'dialog_state_tracking_pipeline': ['DialogStateTrackingPipeline'], + 'domain_classification_pipeline': + ['FasttextSequenceClassificationPipeline'], 'document_segmentation_pipeline': ['DocumentSegmentationPipeline'], 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], 'feature_extraction_pipeline': ['FeatureExtractionPipeline'], @@ -51,12 +55,16 @@ else: 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], + 'table_question_answering_pipeline': + ['TableQuestionAnsweringPipeline'], 'text_classification_pipeline': ['TextClassificationPipeline'], 'text_error_correction_pipeline': ['TextErrorCorrectionPipeline'], 'text_generation_pipeline': ['TextGenerationPipeline'], 'text2text_generation_pipeline': ['Text2TextGenerationPipeline'], 'token_classification_pipeline': ['TokenClassificationPipeline'], 'translation_pipeline': ['TranslationPipeline'], + 'translation_quality_estimation_pipeline': + ['TranslationQualityEstimationPipeline'], 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], diff --git a/modelscope/pipelines/nlp/automatic_post_editing_pipeline.py b/modelscope/pipelines/nlp/automatic_post_editing_pipeline.py new file mode 100644 index 00000000..83968586 --- /dev/null +++ b/modelscope/pipelines/nlp/automatic_post_editing_pipeline.py @@ -0,0 +1,158 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from html import unescape +from typing import Any, Dict + +import jieba +import numpy as np +import tensorflow as tf +from sacremoses import (MosesDetokenizer, MosesDetruecaser, + MosesPunctNormalizer, MosesTokenizer, MosesTruecaser) +from sentencepiece import SentencePieceProcessor +from tensorflow.contrib.seq2seq.python.ops import beam_search_ops + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + tf.disable_eager_execution() + +logger = get_logger() + +__all__ = ['AutomaticPostEditingPipeline'] + + +@PIPELINES.register_module( + Tasks.translation, module_name=Pipelines.automatic_post_editing) +class AutomaticPostEditingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """Build an automatic post editing pipeline with a model dir. 
+ + @param model: Model path for saved pb file + """ + super().__init__(model=model, **kwargs) + export_dir = model + self.cfg = Config.from_file( + os.path.join(export_dir, ModelFile.CONFIGURATION)) + joint_vocab_file = os.path.join( + export_dir, self.cfg[ConfigFields.preprocessor]['vocab']) + self.vocab = dict([(w.strip(), i) for i, w in enumerate( + open(joint_vocab_file, 'r', encoding='utf8'))]) + self.vocab_reverse = dict([(i, w.strip()) for i, w in enumerate( + open(joint_vocab_file, 'r', encoding='utf8'))]) + self.unk_id = self.cfg[ConfigFields.preprocessor].get('unk_id', -1) + strip_unk = self.cfg.get(ConfigFields.postprocessor, + {}).get('strip_unk', True) + self.unk_token = '' if strip_unk else self.cfg.get( + ConfigFields.postprocessor, {}).get('unk_token', '') + if self.unk_id == -1: + self.unk_id = len(self.vocab) - 1 + tf.reset_default_graph() + tf_config = tf.ConfigProto(allow_soft_placement=True) + tf_config.gpu_options.allow_growth = True + self._session = tf.Session(config=tf_config) + tf.saved_model.loader.load( + self._session, [tf.python.saved_model.tag_constants.SERVING], + export_dir) + default_graph = tf.get_default_graph() + self.input_src_id_placeholder = default_graph.get_tensor_by_name( + 'Placeholder:0') + self.input_src_len_placeholder = default_graph.get_tensor_by_name( + 'Placeholder_1:0') + self.input_mt_id_placeholder = default_graph.get_tensor_by_name( + 'Placeholder_2:0') + self.input_mt_len_placeholder = default_graph.get_tensor_by_name( + 'Placeholder_3:0') + output_id_beam = default_graph.get_tensor_by_name( + 'enc2enc/decoder/transpose:0') + output_len_beam = default_graph.get_tensor_by_name( + 'enc2enc/decoder/Minimum:0') + output_id = tf.cast( + tf.map_fn(lambda x: x[0], output_id_beam), dtype=tf.int64) + output_len = tf.map_fn(lambda x: x[0], output_len_beam) + self.output = {'output_ids': output_id, 'output_lens': output_len} + init = tf.global_variables_initializer() + local_init = tf.local_variables_initializer() + self._session.run([init, local_init]) + tf.saved_model.loader.load( + self._session, [tf.python.saved_model.tag_constants.SERVING], + export_dir) + + # preprocess + self._src_lang = self.cfg[ConfigFields.preprocessor]['src_lang'] + self._tgt_lang = self.cfg[ConfigFields.preprocessor]['tgt_lang'] + tok_escape = self.cfg[ConfigFields.preprocessor].get( + 'tokenize_escape', False) + src_tokenizer = MosesTokenizer(lang=self._src_lang) + mt_tokenizer = MosesTokenizer(lang=self._tgt_lang) + truecase_model = os.path.join( + export_dir, self.cfg[ConfigFields.preprocessor]['truecaser']) + truecaser = MosesTruecaser(load_from=truecase_model) + sp_model = os.path.join( + export_dir, self.cfg[ConfigFields.preprocessor]['sentencepiece']) + sp = SentencePieceProcessor() + sp.load(sp_model) + + self.src_preprocess = lambda x: ' '.join( + sp.encode_as_pieces( + truecaser.truecase( + src_tokenizer.tokenize( + x, return_str=True, escape=tok_escape), + return_str=True))) + self.mt_preprocess = lambda x: ' '.join( + sp.encode_as_pieces( + truecaser.truecase( + mt_tokenizer.tokenize( + x, return_str=True, escape=tok_escape), + return_str=True))) + + # post process, de-bpe, de-truecase, detok + detruecaser = MosesDetruecaser() + detokenizer = MosesDetokenizer(lang=self._tgt_lang) + self.postprocess_fun = lambda x: detokenizer.detokenize( + detruecaser.detruecase( + x.replace(' ▁', '@@').replace(' ', '').replace('@@', ' '). 
+ strip()[1:], + return_str=True).split()) + + def preprocess(self, input: str) -> Dict[str, Any]: + src, mt = input.split('\005', 1) + src_sp, mt_sp = self.src_preprocess(src), self.mt_preprocess(mt) + input_src_ids = np.array( + [[self.vocab.get(w, self.unk_id) for w in src_sp.strip().split()]]) + input_mt_ids = np.array( + [[self.vocab.get(w, self.unk_id) for w in mt_sp.strip().split()]]) + input_src_lens = [len(x) for x in input_src_ids] + input_mt_lens = [len(x) for x in input_mt_ids] + feed_dict = { + self.input_src_id_placeholder: input_src_ids, + self.input_mt_id_placeholder: input_mt_ids, + self.input_src_len_placeholder: input_src_lens, + self.input_mt_len_placeholder: input_mt_lens + } + return feed_dict + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + with self._session.as_default(): + sess_outputs = self._session.run(self.output, feed_dict=input) + return sess_outputs + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_ids, output_len = inputs['output_ids'][0], inputs[ + 'output_lens'][0] + output_ids = output_ids[:output_len - 1] # -1 for + output_tokens = ' '.join([ + self.vocab_reverse.get(wid, self.unk_token) for wid in output_ids + ]) + post_editing_output = self.postprocess_fun(output_tokens) + result = {OutputKeys.TRANSLATION: post_editing_output} + return result diff --git a/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py b/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py new file mode 100644 index 00000000..f10af88f --- /dev/null +++ b/modelscope/pipelines/nlp/fasttext_sequence_classification_pipeline.py @@ -0,0 +1,69 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict, Union + +import numpy as np +import sentencepiece +from fasttext import load_model +from fasttext.FastText import _FastText + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['FasttextSequenceClassificationPipeline'] + + +def sentencepiece_tokenize(sp_model, sent): + tokens = [] + for t in sp_model.EncodeAsPieces(sent): + s = t.strip() + if s: + tokens.append(s) + return ' '.join(tokens) + + +@PIPELINES.register_module( + Tasks.text_classification, module_name=Pipelines.domain_classification) +class FasttextSequenceClassificationPipeline(Pipeline): + + def __init__(self, model: Union[str, _FastText], **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model: a model directory including model.bin and spm.model + preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + """ + super().__init__(model=model) + model_file = os.path.join(model, ModelFile.TORCH_MODEL_BIN_FILE) + spm_file = os.path.join(model, 'sentencepiece.model') + assert os.path.isdir(model) and os.path.exists(model_file) and os.path.exists(spm_file), \ + '`model` should be a directory contains `model.bin` and `sentencepiece.model`' + self.model = load_model(model_file) + self.spm = sentencepiece.SentencePieceProcessor() + self.spm.Load(spm_file) + + def preprocess(self, inputs: str) -> Dict[str, Any]: + text = inputs.strip() + text_sp = sentencepiece_tokenize(self.spm, text) + return {'text_sp': text_sp, 
'text': text} + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + topk = inputs.get('topk', -1) + label, probs = self.model.predict(inputs['text_sp'], k=topk) + label = [x.replace('__label__', '') for x in label] + result = { + OutputKeys.LABEL: label[0], + OutputKeys.SCORE: probs[0], + OutputKeys.LABELS: label, + OutputKeys.SCORES: probs + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py new file mode 100644 index 00000000..6ef203b9 --- /dev/null +++ b/modelscope/pipelines/nlp/translation_quality_estimation_pipeline.py @@ -0,0 +1,72 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +from transformers import XLMRobertaTokenizer + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.models.nlp import BertForSequenceClassification +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['TranslationQualityEstimationPipeline'] + + +@PIPELINES.register_module( + Tasks.sentence_similarity, + module_name=Pipelines.translation_quality_estimation) +class TranslationQualityEstimationPipeline(Pipeline): + + def __init__(self, model: str, device: str = 'gpu', **kwargs): + super().__init__(model=model, device=device) + model_file = os.path.join(model, ModelFile.TORCH_MODEL_FILE) + with open(model_file, 'rb') as f: + buffer = io.BytesIO(f.read()) + self.tokenizer = XLMRobertaTokenizer.from_pretrained(model) + self.model = torch.jit.load( + buffer, map_location=self.device).to(self.device) + + def preprocess(self, inputs: Dict[str, Any]): + src_text = inputs['source_text'].strip() + tgt_text = inputs['target_text'].strip() + encoded_inputs = self.tokenizer.batch_encode_plus( + [[src_text, tgt_text]], + return_tensors='pt', + padding=True, + truncation=True) + input_ids = encoded_inputs['input_ids'].to(self.device) + attention_mask = encoded_inputs['attention_mask'].to(self.device) + inputs.update({ + 'input_ids': input_ids, + 'attention_mask': attention_mask + }) + return inputs + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if 'input_ids' not in inputs: + inputs = self.preprocess(inputs) + res = self.model(inputs['input_ids'], inputs['attention_mask']) + result = { + OutputKeys.LABELS: '-1', + OutputKeys.SCORES: res[0].detach().squeeze().tolist() + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): input data dict + + Returns: + Dict[str, str]: the prediction results + """ + return inputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 7968fcd1..5bc27c03 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -267,6 +267,7 @@ class ConfigFields(object): preprocessor = 'preprocessor' train = 'train' evaluation = 'evaluation' + postprocessor = 'postprocessor' class ConfigKeys(object): diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 15f2f41a..f18dde2e 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,7 
+1,10 @@ en_core_web_sm>=2.3.5 +fasttext jieba>=0.42.1 megatron_util pai-easynlp +# “protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.” +protobuf>=3.19.0,<3.21.0 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 diff --git a/tests/pipelines/test_automatic_post_editing.py b/tests/pipelines/test_automatic_post_editing.py new file mode 100644 index 00000000..da09851c --- /dev/null +++ b/tests/pipelines/test_automatic_post_editing.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class AutomaticPostEditingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.translation + self.model_id = 'damo/nlp_automatic_post_editing_for_translation_en2de' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en2de(self): + inputs = 'Simultaneously, the Legion took part to the pacification of Algeria, plagued by various tribal ' \ + 'rebellions and razzias.\005Gleichzeitig nahm die Legion an der Befriedung Algeriens teil, die von ' \ + 'verschiedenen Stammesaufständen und Rasias heimgesucht wurde.' + pipeline_ins = pipeline(self.task, model=self.model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_domain_classification.py b/tests/pipelines/test_domain_classification.py new file mode 100644 index 00000000..8e5bfa7f --- /dev/null +++ b/tests/pipelines/test_domain_classification.py @@ -0,0 +1,45 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
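For reference, the automatic post-editing pipeline exercised by tests/pipelines/test_automatic_post_editing.py above can also be driven directly through the pipeline() factory. A minimal sketch, assuming the same en2de model id as the test; the sentences are placeholders, and the source and its machine translation are joined by the '\005' control character that AutomaticPostEditingPipeline.preprocess splits on:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Same task and model id as in test_automatic_post_editing.py.
    ape = pipeline(
        Tasks.translation,
        model='damo/nlp_automatic_post_editing_for_translation_en2de')

    src = 'The cat sat on the mat.'        # source sentence (placeholder)
    mt = 'Die Katze saß auf der Matte.'    # raw machine translation (placeholder)
    # preprocess() expects '<source>\x05<machine translation>' in one string.
    result = ape(input=src + '\x05' + mt)
    print(result)  # post-edited sentence returned under OutputKeys.TRANSLATION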
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class DomainClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_classification + self.model_id = 'damo/nlp_domain_classification_chinese' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_zh_domain(self): + inputs = '通过这种方式产生的离子吸收大地水分之后,可以通过潮解作用,将活性电解离子有效释放到周围土壤中,使接地极成为一个离子发生装置,' \ + '从而改善周边土质使之达到接地要求。' + pipeline_ins = pipeline(self.task, model=self.model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_zh_style(self): + model_id = 'damo/nlp_style_classification_chinese' + inputs = '通过这种方式产生的离子吸收大地水分之后,可以通过潮解作用,将活性电解离子有效释放到周围土壤中,使接地极成为一个离子发生装置,' \ + '从而改善周边土质使之达到接地要求。' + pipeline_ins = pipeline(self.task, model=model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en_style(self): + model_id = 'damo/nlp_style_classification_english' + inputs = 'High Power 11.1V 5200mAh Lipo Battery For RC Car Robot Airplanes ' \ + 'Helicopter RC Drone Parts 3s Lithium battery 11.1v Battery' + pipeline_ins = pipeline(self.task, model=model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_translation_quality_estimation.py b/tests/pipelines/test_translation_quality_estimation.py new file mode 100644 index 00000000..315fa72b --- /dev/null +++ b/tests/pipelines/test_translation_quality_estimation.py @@ -0,0 +1,32 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
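The domain and style classifiers tested above all go through FasttextSequenceClassificationPipeline, which returns both the top prediction and the full label ranking from fasttext's predict(). A minimal sketch using the Chinese domain model from the test; the input sentence is an arbitrary placeholder:

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Same task and model id as in test_domain_classification.py; the style
    # models ('damo/nlp_style_classification_chinese' / '_english') load the same way.
    classifier = pipeline(
        Tasks.text_classification,
        model='damo/nlp_domain_classification_chinese')

    result = classifier(input='这是一段待分类的中文文本。')
    print(result[OutputKeys.LABEL], result[OutputKeys.SCORE])    # top label and its probability
    print(result[OutputKeys.LABELS], result[OutputKeys.SCORES])  # full ranking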
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class TranslationQualityEstimationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.sentence_similarity + self.model_id = 'damo/nlp_translation_quality_estimation_multilingual' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en2zh(self): + inputs = { + 'source_text': 'Love is a losing game', + 'target_text': '宝贝,人和人一场游戏' + } + pipeline_ins = pipeline(self.task, model=self.model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From b151056b4e6d79baa95e2a7c43e312af2ef14440 Mon Sep 17 00:00:00 2001 From: "lingchen.zlm" Date: Mon, 10 Oct 2022 14:39:21 +0800 Subject: [PATCH 007/129] [to #42322933][bug fix] convert gemm output torch tensor to numpy array for demo support Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10342977 --- modelscope/models/multi_modal/gemm/gemm_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index 09ef2480..806c469c 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -543,6 +543,7 @@ class GEMMModel(nn.Module): img_feature, text_feature, caption = None, None, None if captioning and image is not None: img_feature, caption = self.model.image_to_text(image) + img_feature = self.parse_feat(img_feature) elif image is not None: img_feature = self.parse_feat(self.model.encode_image(image)) if text is not None: From ff69439c4f48bbd1ca3e3f81a3c921925f8e3ca5 Mon Sep 17 00:00:00 2001 From: "ryan.yy" Date: Mon, 10 Oct 2022 17:42:41 +0800 Subject: [PATCH 008/129] [to #42322933]add image_body_reshaping code Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10217723 * add image_body_reshaping code --- data/test/images/image_body_reshaping.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/image_body_reshaping/__init__.py | 20 + .../image_body_reshaping.py | 128 +++++ .../models/cv/image_body_reshaping/model.py | 189 +++++++ .../cv/image_body_reshaping/person_info.py | 339 ++++++++++++ .../pose_estimator/__init__.py | 0 .../pose_estimator/body.py | 272 ++++++++++ .../pose_estimator/model.py | 141 +++++ .../pose_estimator/util.py | 33 ++ .../cv/image_body_reshaping/slim_utils.py | 507 ++++++++++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + .../cv/image_body_reshaping_pipeline.py | 40 ++ modelscope/utils/constant.py | 2 +- requirements/cv.txt | 1 + tests/pipelines/test_image_body_reshaping.py | 58 ++ 17 files changed, 1737 insertions(+), 1 deletion(-) create mode 100644 data/test/images/image_body_reshaping.jpg create mode 100644 modelscope/models/cv/image_body_reshaping/__init__.py create mode 100644 modelscope/models/cv/image_body_reshaping/image_body_reshaping.py create mode 100644 modelscope/models/cv/image_body_reshaping/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/person_info.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py create mode 100644 
modelscope/models/cv/image_body_reshaping/pose_estimator/body.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/util.py create mode 100644 modelscope/models/cv/image_body_reshaping/slim_utils.py create mode 100644 modelscope/pipelines/cv/image_body_reshaping_pipeline.py create mode 100644 tests/pipelines/test_image_body_reshaping.py diff --git a/data/test/images/image_body_reshaping.jpg b/data/test/images/image_body_reshaping.jpg new file mode 100644 index 00000000..d78acb8f --- /dev/null +++ b/data/test/images/image_body_reshaping.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d +size 1127557 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 28804ce6..1b8c4720 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -43,6 +43,7 @@ class Models(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'image-body-reshaping' # EasyCV models yolox = 'YOLOX' @@ -187,6 +188,7 @@ class Pipelines(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'flow-based-body-reshaping' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/image_body_reshaping/__init__.py b/modelscope/models/cv/image_body_reshaping/__init__.py new file mode 100644 index 00000000..a04f110d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .image_body_reshaping import ImageBodyReshaping + +else: + _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py new file mode 100644 index 00000000..4aed8d98 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py @@ -0,0 +1,128 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .model import FlowGenerator +from .person_info import PersonInfo +from .pose_estimator.body import Body +from .slim_utils import image_warp_grid1, resize_on_long_side + +logger = get_logger() + +__all__ = ['ImageBodyReshaping'] + + +@MODELS.register_module( + Tasks.image_body_reshaping, module_name=Models.image_body_reshaping) +class ImageBodyReshaping(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the image body reshaping model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ """ + super().__init__(model_dir, *args, **kwargs) + + if torch.cuda.is_available(): + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + + self.degree = 1.0 + self.reshape_model = FlowGenerator(n_channels=16).to(self.device) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoints = torch.load(model_path, map_location=torch.device('cpu')) + self.reshape_model.load_state_dict( + checkpoints['state_dict'], strict=True) + self.reshape_model.eval() + logger.info('load body reshaping model done') + + pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth') + self.pose_esti = Body(pose_model_ckpt, self.device) + logger.info('load pose model done') + + def pred_joints(self, img): + if img is None: + return None + small_src, resize_scale = resize_on_long_side(img, 300) + body_joints = self.pose_esti(small_src) + + if body_joints.shape[0] >= 1: + body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale + + return body_joints + + def pred_flow(self, img): + + body_joints = self.pred_joints(img) + small_size = 1200 + + if img.shape[0] > small_size or img.shape[1] > small_size: + _img, _scale = resize_on_long_side(img, small_size) + body_joints[:, :, :2] = body_joints[:, :, :2] * _scale + else: + _img = img + + # We only reshape one person + if body_joints.shape[0] < 1 or body_joints.shape[0] > 1: + return None + + person = PersonInfo(body_joints[0]) + + with torch.no_grad(): + person_pred = person.pred_flow(_img, self.reshape_model, + self.device) + + flow = np.dstack((person_pred['rDx'], person_pred['rDy'])) + + scale = img.shape[0] * 1.0 / flow.shape[0] + + flow = cv2.resize(flow, (img.shape[1], img.shape[0])) + flow *= scale + + return flow + + def warp(self, src_img, flow): + + X_flow = flow[..., 0] + Y_flow = flow[..., 1] + + X_flow = np.ascontiguousarray(X_flow) + Y_flow = np.ascontiguousarray(Y_flow) + + pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0) + return pred + + def inference(self, img): + img = img.cpu().numpy() + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + flow = self.pred_flow(img) + + if flow is None: + return img + + assert flow.shape[:2] == img.shape[:2] + + mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8) + mag -= 3 + mag[mag <= 0] = 0 + + x, y = cv2.polarToCart(mag, ang, angleInDegrees=False) + flow = np.dstack((x, y)) + + flow *= self.degree + pred = self.warp(img, flow) + out_img = np.clip(pred, 0, 255) + logger.info('model inference done') + + return out_img.astype(np.uint8) diff --git a/modelscope/models/cv/image_body_reshaping/model.py b/modelscope/models/cv/image_body_reshaping/model.py new file mode 100644 index 00000000..174428a1 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/model.py @@ -0,0 +1,189 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
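The warp step above applies the predicted dense flow with the project's image_warp_grid1 helper. Purely as an illustration of what the stacked (rDx, rDy) displacement field does, a rough equivalent can be sketched with cv2.remap; the sign convention here is an assumption and may differ from image_warp_grid1:

    import cv2
    import numpy as np


    def warp_with_remap(img: np.ndarray, flow: np.ndarray) -> np.ndarray:
        # flow[..., 0] / flow[..., 1] are per-pixel x / y displacements in pixels,
        # matching the np.dstack((rDx, rDy)) layout built in pred_flow above.
        h, w = img.shape[:2]
        xs, ys = np.meshgrid(np.arange(w, dtype=np.float32),
                             np.arange(h, dtype=np.float32))
        # For each output pixel, sample the source at (x - dx, y - dy); whether the
        # displacement is added or subtracted is a convention choice.
        map_x = (xs - flow[..., 0]).astype(np.float32)
        map_y = (ys - flow[..., 1]).astype(np.float32)
        return cv2.remap(img, map_x, map_y, interpolation=cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_REPLICATE)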
+import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + + def __init__(self, in_ch, out_ch): + super(ConvLayer, self).__init__() + + self.conv = nn.Sequential( + nn.ReflectionPad2d(1), + nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0), + nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)) + + def forward(self, x): + x = self.conv(x) + return x + + +class SASA(nn.Module): + + def __init__(self, in_dim): + super(SASA, self).__init__() + self.chanel_in = in_dim + + self.query_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.key_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.value_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.mag_conv = nn.Conv2d( + in_channels=5, out_channels=in_dim // 32, kernel_size=1) + + self.gamma = nn.Parameter(torch.zeros(1)) + + self.softmax = nn.Softmax(dim=-1) # + self.sigmoid = nn.Sigmoid() + + def structure_encoder(self, paf_mag, target_height, target_width): + torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True) + torso_mask = torch.clamp(torso_mask, 0, 1) + + arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True) + arms_mask = torch.clamp(arms_mask, 0, 1) + + legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True) + legs_mask = torch.clamp(legs_mask, 0, 1) + + fg_mask = paf_mag[:, 12, :, :].unsqueeze(1) + bg_mask = 1 - fg_mask + Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask), + dim=1) + Y = F.interpolate(Y, size=(target_height, target_width), mode='area') + return Y + + def forward(self, X, PAF_mag): + """extract self-attention features. + Args: + X : input feature maps( B x C x H x W) + PAF_mag : ( B x C x H x W), 1 denotes connectivity, 0 denotes non-connectivity + + Returns: + out : self attention value + input feature + Y: B X N X N (N is Width*Height) + """ + + m_batchsize, C, height, width = X.size() + + Y = self.structure_encoder(PAF_mag, height, width) + + connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1, + width * height) + affinity = torch.bmm( + connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec) + affinity_centered = affinity - torch.mean(affinity) + affinity_sigmoid = self.sigmoid(affinity_centered) + + proj_query = self.query_conv(X).view(m_batchsize, -1, + width * height).permute(0, 2, 1) + proj_key = self.key_conv(X).view(m_batchsize, -1, width * height) + selfatten_map = torch.bmm(proj_query, proj_key) + selfatten_centered = selfatten_map - torch.mean( + selfatten_map) # centering + selfatten_sigmoid = self.sigmoid(selfatten_centered) + + SASA_map = selfatten_sigmoid * affinity_sigmoid + + proj_value = self.value_conv(X).view(m_batchsize, -1, width * height) + + out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1)) + out = out.view(m_batchsize, C, height, width) + + out = self.gamma * out + X + return out, Y + + +class FlowGenerator(nn.Module): + + def __init__(self, n_channels, deep_supervision=False): + super(FlowGenerator, self).__init__() + self.deep_supervision = deep_supervision + + self.Encoder = nn.Sequential( + ConvLayer(n_channels, 64), + ConvLayer(64, 64), + nn.MaxPool2d(2), + ConvLayer(64, 128), + ConvLayer(128, 128), + nn.MaxPool2d(2), + ConvLayer(128, 256), + ConvLayer(256, 256), + nn.MaxPool2d(2), + ConvLayer(256, 512), + ConvLayer(512, 512), + nn.MaxPool2d(2), + ConvLayer(512, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ) + + 
self.SASA = SASA(in_dim=1024) + + self.Decoder = nn.Sequential( + ConvLayer(1024, 1024), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(1024, 512), + ConvLayer(512, 512), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(512, 256), + ConvLayer(256, 256), + ConvLayer(256, 128), + ConvLayer(128, 64), + ConvLayer(64, 32), + nn.Conv2d(32, 2, kernel_size=1, padding=0), + nn.Tanh(), + nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True), + ) + + dilation_ksize = 17 + self.dilation = torch.nn.MaxPool2d( + kernel_size=dilation_ksize, + stride=1, + padding=int((dilation_ksize - 1) / 2)) + + def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2): + n, c, h, w = x.size() + yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) + xv = xv.float() / (w - 1) * 2.0 - 1 + yv = yv.float() / (h - 1) * 2.0 - 1 + grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0) + grid = grid.to(flow.device) + grid_x = grid + 2 * flow * coff + warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode) + return warp_x + + def forward(self, img, skeleton_map, coef=0.2): + """extract self-attention features. + Args: + img : input numpy image + skeleton_map : skeleton map of input image + coef: warp degree + + Returns: + warp_x : warped image + flow: predicted flow + """ + + img_concat = torch.cat((img, skeleton_map), dim=1) + X = self.Encoder(img_concat) + + _, _, height, width = X.size() + + # directly get PAF magnitude from skeleton maps via dilation + PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5) + + out, Y = self.SASA(X, PAF_mag) + flow = self.Decoder(out) + + flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2] + + warp_x = self.warp(img, flow, coff=coef) + warp_x = torch.clamp(warp_x, min=-1.0, max=1.0) + + return warp_x, flow diff --git a/modelscope/models/cv/image_body_reshaping/person_info.py b/modelscope/models/cv/image_body_reshaping/person_info.py new file mode 100644 index 00000000..509a2ce3 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/person_info.py @@ -0,0 +1,339 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
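As a quick sanity check of the generator interface defined above, the following sketch runs an untrained FlowGenerator on random tensors. The 13-channel skeleton map is an assumption read off n_channels=16 (3 RGB channels plus the skeleton channels sliced in SASA.structure_encoder); only the tensor shapes are meaningful here:

    import torch

    from modelscope.models.cv.image_body_reshaping.model import FlowGenerator

    net = FlowGenerator(n_channels=16).eval()
    img = torch.randn(1, 3, 256, 256)           # RGB crop scaled to [-1, 1]
    skel = torch.randn(1, 13, 256, 256).sign()  # skeleton map in {-1, 1}; channel count assumed
    with torch.no_grad():
        warped, flow = net(img, skel)
    print(warped.shape)  # torch.Size([1, 3, 256, 256])
    print(flow.shape)    # torch.Size([1, 256, 256, 2])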
+import copy + +import cv2 +import numpy as np +import torch + +from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, + get_map_fusion_map_cuda, get_mask_bbox, + resize_on_long_side) + + +class PersonInfo(object): + + def __init__(self, joints): + self.joints = joints + self.flow = None + self.pad_boder = False + self.height_expand = 0 + self.width_expand = 0 + self.coeff = 0.2 + self.network_input_W = 256 + self.network_input_H = 256 + self.divider = 20 + self.flow_scales = ['upper_2'] + + def update_attribute(self, pad_boder, height_expand, width_expand): + self.pad_boder = pad_boder + self.height_expand = height_expand + self.width_expand = width_expand + if pad_boder: + self.joints[:, 0] += width_expand + self.joints[:, 1] += height_expand + + def pred_flow(self, img, flow_net, device): + with torch.no_grad(): + if img is None: + print('image is none') + self.flow = None + + if len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + if self.pad_boder: + height_expand = self.height_expand + width_expand = self.width_expand + pad_img = cv2.copyMakeBorder( + img, + height_expand, + height_expand, + width_expand, + width_expand, + cv2.BORDER_CONSTANT, + value=(127, 127, 127)) + + else: + height_expand = 0 + width_expand = 0 + pad_img = img.copy() + + canvas = np.zeros( + shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) + + self.human_joint_box = self.__joint_to_body_box() + + self.human_box = enlarge_box_tblr( + self.human_joint_box, pad_img, ratio=0.25) + human_box_height = self.human_box[1] - self.human_box[0] + human_box_width = self.human_box[3] - self.human_box[2] + + self.leg_joint_box = self.__joint_to_leg_box() + self.leg_box = enlarge_box_tblr( + self.leg_joint_box, pad_img, ratio=0.25) + + self.arm_joint_box = self.__joint_to_arm_box() + self.arm_box = enlarge_box_tblr( + self.arm_joint_box, pad_img, ratio=0.1) + + x_flows = [] + y_flows = [] + multi_bbox = [] + + for scale in self.flow_scales: # better for metric + scale_value = float(scale.split('_')[-1]) + + arm_box = copy.deepcopy(self.arm_box) + + if arm_box[0] is None: + arm_box = self.human_box + + arm_box_height = arm_box[1] - arm_box[0] + arm_box_width = arm_box[3] - arm_box[2] + + roi_bbox = None + + if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: + roi_bbox = self.human_box + else: + arm_box = enlarge_box_tblr( + arm_box, pad_img, ratio=scale_value) + if scale == 'upper_0.2': + arm_box[0] = min(arm_box[0], int(self.joints[0][1])) + if scale.startswith('upper'): + roi_bbox = [ + max(self.human_box[0], arm_box[0]), + min(self.human_box[1], arm_box[1]), + max(self.human_box[2], arm_box[2]), + min(self.human_box[3], arm_box[3]) + ] + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + elif scale.startswith('lower'): + roi_bbox = [ + max(self.human_box[0], self.leg_box[0]), + min(self.human_box[1], self.leg_box[1]), + max(self.human_box[2], self.leg_box[2]), + min(self.human_box[3], self.leg_box[3]) + ] + + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + skel_map, roi_bbox = gen_skeleton_map( + self.joints, 'depth', input_roi_box=roi_bbox) + + if roi_bbox is None: + continue + + if skel_map.dtype != np.float32: + skel_map = skel_map.astype(np.float32) + + skel_map -= 1.0 # [0,2] ->[-1,1] + + multi_bbox.append(roi_bbox) + + roi_bbox_height = roi_bbox[1] - roi_bbox[0] + roi_bbox_width = roi_bbox[3] - roi_bbox[2] + + assert skel_map.shape[0] == roi_bbox_height + assert skel_map.shape[1] 
== roi_bbox_width + roi_height_pad = roi_bbox_height // self.divider + roi_width_pad = roi_bbox_width // self.divider + paded_roi_h = roi_bbox_height + 2 * roi_height_pad + paded_roi_w = roi_bbox_width + 2 * roi_width_pad + + roi_height_pad_joint = skel_map.shape[0] // self.divider + roi_width_pad_joint = skel_map.shape[1] // self.divider + skel_map = np.pad( + skel_map, + ((roi_height_pad_joint, roi_height_pad_joint), + (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), + 'constant', + constant_values=-1) + + skel_map_resized = cv2.resize( + skel_map, (self.network_input_W, self.network_input_H)) + + skel_map_resized[skel_map_resized < 0] = -1.0 + skel_map_resized[skel_map_resized > -0.5] = 1.0 + skel_map_transformed = torch.from_numpy( + skel_map_resized.transpose((2, 0, 1))) + + roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], + roi_bbox[2]:roi_bbox[3], :].copy() + if roi_npy.dtype != np.float32: + roi_npy = roi_npy.astype(np.float32) + + roi_npy = np.pad(roi_npy, + ((roi_height_pad, roi_height_pad), + (roi_width_pad, roi_width_pad), (0, 0)), + 'edge') + + roi_npy = roi_npy[:, :, ::-1] + + roi_npy = cv2.resize( + roi_npy, (self.network_input_W, self.network_input_H)) + + roi_npy *= 1.0 / 255 + roi_npy -= 0.5 + roi_npy *= 2 + + rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) + + rgb_tensor = rgb_tensor.unsqueeze(0).to(device) + skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) + warped_img_val, flow_field_val = flow_net( + rgb_tensor, skel_map_tensor + ) # inference, connectivity_mask [1,12,16,16] + flow_field_val = flow_field_val.detach().squeeze().cpu().numpy( + ) + + flow_field_val = cv2.resize( + flow_field_val, (paded_roi_w, paded_roi_h), + interpolation=cv2.INTER_LINEAR) + flow_field_val[..., 0] = flow_field_val[ + ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff + flow_field_val[..., 1] = flow_field_val[ + ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff + + # remove pad areas + flow_field_val = flow_field_val[ + roi_height_pad:flow_field_val.shape[0] - roi_height_pad, + roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] + + diffuse_width = max(roi_bbox_width // 3, 1) + diffuse_height = max(roi_bbox_height // 3, 1) + assert roi_bbox_width == flow_field_val.shape[1] + assert roi_bbox_height == flow_field_val.shape[0] + + origin_flow = np.zeros( + (pad_img.shape[0] + 2 * diffuse_height, + pad_img.shape[1] + 2 * diffuse_width, 2), + dtype=np.float32) + + flow_field_val = np.pad(flow_field_val, + ((diffuse_height, diffuse_height), + (diffuse_width, diffuse_width), + (0, 0)), 'linear_ramp') + + origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, + roi_bbox[2]:roi_bbox[3] + + 2 * diffuse_width] = flow_field_val + + origin_flow = origin_flow[diffuse_height:-diffuse_height, + diffuse_width:-diffuse_width, :] + + x_flows.append(origin_flow[..., 0]) + y_flows.append(origin_flow[..., 1]) + + if len(x_flows) == 0: + return { + 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), + 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), + 'multi_bbox': multi_bbox, + 'x_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32), + 'y_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32) + } + else: + origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( + x_flows, y_flows, device=device) + + return { + 'rDx': origin_rDx, + 'rDy': origin_rDy, + 'multi_bbox': multi_bbox, + 'x_fusion_map': x_fusion_map, + 'y_fusion_map': y_fusion_map + } + + @staticmethod + def blend_multiscale_flow(x_flows, y_flows, device=None): + scale_num = len(x_flows) + 
if scale_num == 1: + return x_flows[0], y_flows[0], np.ones_like( + x_flows[0]), np.ones_like(x_flows[0]) + + origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), + dtype=np.float32) + origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), + dtype=np.float32) + + x_fusion_map, x_acc_map = get_map_fusion_map_cuda( + x_flows, 1, device=device) + y_fusion_map, y_acc_map = get_map_fusion_map_cuda( + y_flows, 1, device=device) + + x_flow_map = 1.0 / x_fusion_map + y_flow_map = 1.0 / y_fusion_map + + all_acc_map = x_acc_map + y_acc_map + all_acc_map = all_acc_map.astype(np.uint8) + roi_box = get_mask_bbox(all_acc_map, threshold=1) + + if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ + 3] - roi_box[2] <= 0: + roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] + + roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + + roi_width = roi_x_flow_map.shape[1] + roi_height = roi_x_flow_map.shape[0] + + roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) + roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) + + roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) + roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) + + roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) + roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) + + x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_x_flow_map + y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_y_flow_map + + for i in range(scale_num): + origin_rDx += x_flows[i] + origin_rDy += y_flows[i] + + origin_rDx *= x_flow_map + origin_rDy *= y_flow_map + + return origin_rDx, origin_rDy, x_flow_map, y_flow_map + + def __joint_to_body_box(self): + joint_left = int(np.min(self.joints, axis=0)[0]) + joint_right = int(np.max(self.joints, axis=0)[0]) + joint_top = int(np.min(self.joints, axis=0)[1]) + joint_bottom = int(np.max(self.joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_leg_box(self): + leg_joints = self.joints[8:, :] + if np.max(leg_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(leg_joints, axis=0)[0]) + joint_right = int(np.max(leg_joints, axis=0)[0]) + joint_top = int(np.min(leg_joints, axis=0)[1]) + joint_bottom = int(np.max(leg_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_arm_box(self): + arm_joints = self.joints[2:8, :] + if np.max(arm_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(arm_joints, axis=0)[0]) + joint_right = int(np.max(arm_joints, axis=0)[0]) + joint_top = int(np.min(arm_joints, axis=0)[1]) + joint_bottom = int(np.max(arm_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py new file mode 100644 index 00000000..45b02724 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py @@ -0,0 +1,272 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. 
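The three __joint_to_*_box helpers above all reduce a subset of the (18, 3) joints array, whose rows are (x, y, confidence), to a [top, bottom, left, right] box via min/max over the y and x columns. A tiny standalone illustration of that convention with made-up coordinates:

    import numpy as np

    joints = np.array([[120.0, 40.0, 0.9],   # each row is (x, y, confidence)
                       [150.0, 90.0, 0.8],
                       [100.0, 70.0, 0.7]])

    top, bottom = int(joints[:, 1].min()), int(joints[:, 1].max())
    left, right = int(joints[:, 0].min()), int(joints[:, 0].max())
    print([top, bottom, left, right])  # [40, 90, 100, 150]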
+ +import math + +import cv2 +import numpy as np +import torch +from scipy.ndimage.filters import gaussian_filter + +from .model import BodyposeModel +from .util import pad_rightdown_corner, transfer + + +class Body(object): + + def __init__(self, model_path, device): + self.model = BodyposeModel().to(device) + model_dict = transfer(self.model, torch.load(model_path)) + self.model.load_state_dict(model_dict) + self.model.eval() + + def __call__(self, oriImg): + scale_search = [0.5] + boxsize = 368 + stride = 8 + padValue = 128 + thre1 = 0.1 + thre2 = 0.05 + bodyparts = 18 + multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] + heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) + paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) + + for m in range(len(multiplier)): + scale = multiplier[m] + imageToTest = cv2.resize( + oriImg, (0, 0), + fx=scale, + fy=scale, + interpolation=cv2.INTER_CUBIC) + imageToTest_padded, pad = pad_rightdown_corner( + imageToTest, stride, padValue) + im = np.transpose( + np.float32(imageToTest_padded[:, :, :, np.newaxis]), + (3, 2, 0, 1)) / 256 - 0.5 + im = np.ascontiguousarray(im) + + data = torch.from_numpy(im).float() + if torch.cuda.is_available(): + data = data.cuda() + with torch.no_grad(): + Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) + Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() + Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() + + # extract outputs, resize, and remove padding + heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), + (1, 2, 0)) # output 1 is heatmaps + heatmap = cv2.resize( + heatmap, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + heatmap = heatmap[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] + - pad[3], :] + heatmap = cv2.resize( + heatmap, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + paf = np.transpose(np.squeeze(Mconv7_stage6_L1), + (1, 2, 0)) # output 0 is PAFs + paf = cv2.resize( + paf, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + paf = paf[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] - pad[3], :] + paf = cv2.resize( + paf, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + heatmap_avg += heatmap_avg + heatmap / len(multiplier) + paf_avg += +paf / len(multiplier) + + all_peaks = [] + peak_counter = 0 + + for part in range(bodyparts): + map_ori = heatmap_avg[:, :, part] + one_heatmap = gaussian_filter(map_ori, sigma=3) + + map_left = np.zeros(one_heatmap.shape) + map_left[1:, :] = one_heatmap[:-1, :] + map_right = np.zeros(one_heatmap.shape) + map_right[:-1, :] = one_heatmap[1:, :] + map_up = np.zeros(one_heatmap.shape) + map_up[:, 1:] = one_heatmap[:, :-1] + map_down = np.zeros(one_heatmap.shape) + map_down[:, :-1] = one_heatmap[:, 1:] + + peaks_binary = np.logical_and.reduce( + (one_heatmap >= map_left, one_heatmap >= map_right, + one_heatmap >= map_up, one_heatmap >= map_down, + one_heatmap > thre1)) + peaks = list( + zip(np.nonzero(peaks_binary)[1], + np.nonzero(peaks_binary)[0])) # note reverse + peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks] + peak_id = range(peak_counter, peak_counter + len(peaks)) + peaks_with_score_and_id = [ + peaks_with_score[i] + (peak_id[i], ) + for i in range(len(peak_id)) + ] + + all_peaks.append(peaks_with_score_and_id) + peak_counter += len(peaks) + + # find connection in the specified sequence, center 29 is in the position 15 + limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], 
+ [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], + [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] + # the middle joints heatmap correpondence + mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], + [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], + [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], + [45, 46]] + + connection_all = [] + special_k = [] + mid_num = 10 + + for k in range(len(mapIdx)): + score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] + candA = all_peaks[limbSeq[k][0] - 1] + candB = all_peaks[limbSeq[k][1] - 1] + nA = len(candA) + nB = len(candB) + if (nA != 0 and nB != 0): + connection_candidate = [] + for i in range(nA): + for j in range(nB): + vec = np.subtract(candB[j][:2], candA[i][:2]) + norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) + norm = max(0.001, norm) + vec = np.divide(vec, norm) + + startend = list( + zip( + np.linspace( + candA[i][0], candB[j][0], num=mid_num), + np.linspace( + candA[i][1], candB[j][1], num=mid_num))) + + vec_x = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 0] + for item in range(len(startend)) + ]) + vec_y = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 1] + for item in range(len(startend)) + ]) + + score_midpts = np.multiply( + vec_x, vec[0]) + np.multiply(vec_y, vec[1]) + temp1 = sum(score_midpts) / len(score_midpts) + temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0) + score_with_dist_prior = temp1 + temp2 + criterion1 = len(np.nonzero( + score_midpts > thre2)[0]) > 0.8 * len(score_midpts) + criterion2 = score_with_dist_prior > 0 + if criterion1 and criterion2: + connection_candidate.append([ + i, j, score_with_dist_prior, + score_with_dist_prior + candA[i][2] + + candB[j][2] + ]) + + connection_candidate = sorted( + connection_candidate, key=lambda x: x[2], reverse=True) + connection = np.zeros((0, 5)) + for c in range(len(connection_candidate)): + i, j, s = connection_candidate[c][0:3] + if (i not in connection[:, 3] + and j not in connection[:, 4]): + connection = np.vstack( + [connection, [candA[i][3], candB[j][3], s, i, j]]) + if (len(connection) >= min(nA, nB)): + break + + connection_all.append(connection) + else: + special_k.append(k) + connection_all.append([]) + + # last number in each row is the total parts number of that person + # the second last number in each row is the score of the overall configuration + subset = -1 * np.ones((0, 20)) + candidate = np.array( + [item for sublist in all_peaks for item in sublist]) + + for k in range(len(mapIdx)): + if k not in special_k: + partAs = connection_all[k][:, 0] + partBs = connection_all[k][:, 1] + indexA, indexB = np.array(limbSeq[k]) - 1 + + for i in range(len(connection_all[k])): # = 1:size(temp,1) + found = 0 + subset_idx = [-1, -1] + for j in range(len(subset)): # 1:size(subset,1): + if subset[j][indexA] == partAs[i] or subset[j][ + indexB] == partBs[i]: + subset_idx[found] = j + found += 1 + + if found == 1: + j = subset_idx[0] + if subset[j][indexB] != partBs[i]: + subset[j][indexB] = partBs[i] + subset[j][-1] += 1 + subset[j][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + elif found == 2: # if found 2 and disjoint, merge them + j1, j2 = subset_idx + tmp1 = (subset[j1] >= 0).astype(int) + tmp2 = (subset[j2] >= 0).astype(int) + membership = (tmp1 + tmp2)[:-2] + if len(np.nonzero(membership == 2)[0]) == 0: # merge + subset[j1][:-2] += (subset[j2][:-2] + 1) + subset[j1][-2:] += subset[j2][-2:] + 
subset[j1][-2] += connection_all[k][i][2] + subset = np.delete(subset, j2, 0) + else: # as like found == 1 + subset[j1][indexB] = partBs[i] + subset[j1][-1] += 1 + subset[j1][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + + # if find no partA in the subset, create a new subset + elif not found and k < 17: + row = -1 * np.ones(20) + row[indexA] = partAs[i] + row[indexB] = partBs[i] + row[-1] = 2 + row[-2] = sum( + candidate[connection_all[k][i, :2].astype(int), + 2]) + connection_all[k][i][2] + subset = np.vstack([subset, row]) + # delete some rows of subset which has few parts occur + deleteIdx = [] + for i in range(len(subset)): + if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: + deleteIdx.append(i) + subset = np.delete(subset, deleteIdx, axis=0) + + # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts + # candidate: x, y, score, id + count = subset.shape[0] + joints = np.zeros(shape=(count, bodyparts, 3)) + + for i in range(count): + for j in range(bodyparts): + joints[i, j, :3] = candidate[int(subset[i, j]), :3] + confidence = 1.0 if subset[i, j] >= 0 else 0.0 + joints[i, j, 2] *= confidence + return joints diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py new file mode 100644 index 00000000..12f6e84d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py @@ -0,0 +1,141 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. + +from collections import OrderedDict + +import torch +import torch.nn as nn + + +def make_layers(block, no_relu_layers): + layers = [] + for layer_name, v in block.items(): + if 'pool' in layer_name: + layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) + layers.append((layer_name, layer)) + else: + conv2d = nn.Conv2d( + in_channels=v[0], + out_channels=v[1], + kernel_size=v[2], + stride=v[3], + padding=v[4]) + layers.append((layer_name, conv2d)) + if layer_name not in no_relu_layers: + layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) + + return nn.Sequential(OrderedDict(layers)) + + +class BodyposeModel(nn.Module): + + def __init__(self): + super(BodyposeModel, self).__init__() + + # these layers have no relu layer + no_relu_layers = [ + 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', + 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2', + 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', + 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1' + ] + blocks = {} + block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), + ('conv1_2', [64, 64, 3, 1, 1]), + ('pool1_stage1', [2, 2, 0]), + ('conv2_1', [64, 128, 3, 1, 1]), + ('conv2_2', [128, 128, 3, 1, 1]), + ('pool2_stage1', [2, 2, 0]), + ('conv3_1', [128, 256, 3, 1, 1]), + ('conv3_2', [256, 256, 3, 1, 1]), + ('conv3_3', [256, 256, 3, 1, 1]), + ('conv3_4', [256, 256, 3, 1, 1]), + ('pool3_stage1', [2, 2, 0]), + ('conv4_1', [256, 512, 3, 1, 1]), + ('conv4_2', [512, 512, 3, 1, 1]), + ('conv4_3_CPM', [512, 256, 3, 1, 1]), + ('conv4_4_CPM', [256, 128, 3, 1, 1])]) + + # Stage 1 + block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])]) + + block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L2', [128, 128, 
3, 1, 1]), + ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])]) + blocks['block1_1'] = block1_1 + blocks['block1_2'] = block1_2 + + self.model0 = make_layers(block0, no_relu_layers) + + # Stages 2 - 6 + for i in range(2, 7): + blocks['block%d_1' % i] = OrderedDict([ + ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) + ]) + + blocks['block%d_2' % i] = OrderedDict([ + ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) + ]) + + for k in blocks.keys(): + blocks[k] = make_layers(blocks[k], no_relu_layers) + + self.model1_1 = blocks['block1_1'] + self.model2_1 = blocks['block2_1'] + self.model3_1 = blocks['block3_1'] + self.model4_1 = blocks['block4_1'] + self.model5_1 = blocks['block5_1'] + self.model6_1 = blocks['block6_1'] + + self.model1_2 = blocks['block1_2'] + self.model2_2 = blocks['block2_2'] + self.model3_2 = blocks['block3_2'] + self.model4_2 = blocks['block4_2'] + self.model5_2 = blocks['block5_2'] + self.model6_2 = blocks['block6_2'] + + def forward(self, x): + + out1 = self.model0(x) + + out1_1 = self.model1_1(out1) + out1_2 = self.model1_2(out1) + out2 = torch.cat([out1_1, out1_2, out1], 1) + + out2_1 = self.model2_1(out2) + out2_2 = self.model2_2(out2) + out3 = torch.cat([out2_1, out2_2, out1], 1) + + out3_1 = self.model3_1(out3) + out3_2 = self.model3_2(out3) + out4 = torch.cat([out3_1, out3_2, out1], 1) + + out4_1 = self.model4_1(out4) + out4_2 = self.model4_2(out4) + out5 = torch.cat([out4_1, out4_2, out1], 1) + + out5_1 = self.model5_1(out5) + out5_2 = self.model5_2(out5) + out6 = torch.cat([out5_1, out5_2, out1], 1) + + out6_1 = self.model6_1(out6) + out6_2 = self.model6_2(out6) + + return out6_1, out6_2 diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py new file mode 100644 index 00000000..13a42074 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py @@ -0,0 +1,33 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. 
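A quick shape check of the two-branch BodyposeModel above, mirroring how Body.__call__ feeds it (NCHW float input scaled by /256 - 0.5); both branches come back at 1/8 of the input resolution, with L1 carrying the 38 PAF channels and L2 the 19 heatmap channels:

    import torch

    from modelscope.models.cv.image_body_reshaping.pose_estimator.model import BodyposeModel

    net = BodyposeModel().eval()
    x = torch.randn(1, 3, 368, 368) / 256 - 0.5  # random stand-in for a preprocessed image
    with torch.no_grad():
        pafs, heatmaps = net(x)  # stage-6 L1 and L2 outputs
    print(pafs.shape)      # torch.Size([1, 38, 46, 46])
    print(heatmaps.shape)  # torch.Size([1, 19, 46, 46])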
+import numpy as np + + +def pad_rightdown_corner(img, stride, padValue): + h = img.shape[0] + w = img.shape[1] + + pad = 4 * [None] + pad[0] = 0 # up + pad[1] = 0 # left + pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down + pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right + + img_padded = img + pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) + img_padded = np.concatenate((pad_up, img_padded), axis=0) + pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) + img_padded = np.concatenate((pad_left, img_padded), axis=1) + pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) + img_padded = np.concatenate((img_padded, pad_down), axis=0) + pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) + img_padded = np.concatenate((img_padded, pad_right), axis=1) + + return img_padded, pad + + +def transfer(model, model_weights): + transfered_model_weights = {} + for weights_name in model.state_dict().keys(): + transfered_model_weights[weights_name] = model_weights['.'.join( + weights_name.split('.')[1:])] + return transfered_model_weights diff --git a/modelscope/models/cv/image_body_reshaping/slim_utils.py b/modelscope/models/cv/image_body_reshaping/slim_utils.py new file mode 100644 index 00000000..23d5a741 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/slim_utils.py @@ -0,0 +1,507 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +import os +import random + +import cv2 +import numba +import numpy as np +import torch + + +def resize_on_long_side(img, long_side=800): + src_height = img.shape[0] + src_width = img.shape[1] + + if src_height > src_width: + scale = long_side * 1.0 / src_height + _img = cv2.resize( + img, (int(src_width * scale), long_side), + interpolation=cv2.INTER_LINEAR) + else: + scale = long_side * 1.0 / src_width + _img = cv2.resize( + img, (long_side, int(src_height * scale)), + interpolation=cv2.INTER_LINEAR) + + return _img, scale + + +def point_in_box(pt, box): + pt_x = pt[0] + pt_y = pt[1] + + if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[ + 1] and pt_y <= box[1] + box[3]: + return True + else: + return False + + +def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True): + if roi_bbox is None or None in roi_bbox: + return [None, None, None, None] + + top = roi_bbox[0] + bottom = roi_bbox[1] + left = roi_bbox[2] + right = roi_bbox[3] + + roi_width = roi_bbox[3] - roi_bbox[2] + roi_height = roi_bbox[1] - roi_bbox[0] + right = left + roi_width + bottom = top + roi_height + + long_side = roi_width if roi_width > roi_height else roi_height + + if use_long_side: + new_left = left - int(long_side * ratio) + else: + new_left = left - int(roi_width * ratio) + new_left = 1 if new_left < 0 else new_left + + if use_long_side: + new_top = top - int(long_side * ratio) + else: + new_top = top - int(roi_height * ratio) + new_top = 1 if new_top < 0 else new_top + + if use_long_side: + new_right = right + int(long_side * ratio) + else: + new_right = right + int(roi_width * ratio) + new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right + + if use_long_side: + new_bottom = bottom + int(long_side * ratio) + else: + new_bottom = bottom + int(roi_height * ratio) + new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom + + bbox = [new_top, new_bottom, new_left, new_right] + return bbox + + +def gen_PAF(image, joints): + + assert joints.shape[0] == 18 + assert joints.shape[1] == 3 + + 
org_h = image.shape[0] + org_w = image.shape[1] + small_image, resize_scale = resize_on_long_side(image, 120) + + joints[:, :2] = joints[:, :2] * resize_scale + + joint_left = int(np.min(joints, axis=0)[0]) + joint_right = int(np.max(joints, axis=0)[0]) + joint_top = int(np.min(joints, axis=0)[1]) + joint_bottom = int(np.max(joints, axis=0)[1]) + + limb_width = min( + abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6 + + if limb_width % 2 == 0: + limb_width += 1 + kernel_size = limb_width + + part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12), + (12, 13), (8, 9), (9, 10)] + + map_list = [] + mask_list = [] + PAF_all = np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + for c, pair in enumerate(part_orders): + idx_a_name = pair[0] + idx_b_name = pair[1] + + jointa = joints[idx_a_name] + jointb = joints[idx_b_name] + + confidence_threshold = 0.05 + if jointa[2] > confidence_threshold and jointb[ + 2] > confidence_threshold: + canvas = np.zeros( + shape=(small_image.shape[0], small_image.shape[1]), + dtype=np.uint8) + + canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])), + (int(jointb[0]), int(jointb[1])), + (255, 255, 255), 5) + + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (kernel_size, kernel_size)) + + canvas = cv2.dilate(canvas, kernel, 1) + canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0) + canvas = canvas.astype(np.float32) / 255 + PAF = np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + PAF[..., 0] = jointb[0] - jointa[0] + PAF[..., 1] = jointb[1] - jointa[1] + mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1]) + PAF /= (np.dstack((mag, mag)) + 1e-5) + + single_PAF = PAF * np.dstack((canvas, canvas)) + map_list.append( + cv2.GaussianBlur(single_PAF, + (kernel_size * 3, kernel_size * 3), 0)) + + mask_list.append( + cv2.GaussianBlur(canvas.copy(), + (kernel_size * 3, kernel_size * 3), 0)) + PAF_all = PAF_all * (1.0 - np.dstack( + (canvas, canvas))) + single_PAF + + PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0) + PAF_all = cv2.resize( + PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR) + map_list.append(PAF_all) + return PAF_all, map_list, mask_list + + +def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None): + if type(joints) == list: + joints = np.array(joints) + assert stack_mode == 'column' or stack_mode == 'depth' + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0): + jointa = joints[a] + jointb = joints[b] + + temp1 = int((jointa[0] - x_offset) * scale) + temp2 = int((jointa[1] - y_offset) * scale) + temp3 = int((jointb[0] - x_offset) * scale) + temp4 = int((jointb[1] - y_offset) * scale) + + cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width) + + roi_box = input_roi_box + + roi_box_width = roi_box[3] - roi_box[2] + roi_box_height = roi_box[1] - roi_box[0] + short_side_length = min(roi_box_width, roi_box_height) + line_width = short_side_length // 30 + + line_width = max(line_width, 2) + + map_cube = np.zeros( + shape=(roi_box_height, roi_box_width, len(part_orders) + 1), + dtype=np.float32) + + use_line_width = min(5, line_width) + fx = use_line_width * 1.0 / line_width # fx 最大值为1 + + if fx < 0.99: + map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx) + + for c, pair in enumerate(part_orders): + tmp = map_cube[..., 
c].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., c] = tmp + + tmp = map_cube[..., -1].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., -1] = tmp + + map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height)) + + if stack_mode == 'depth': + return map_cube, roi_box + elif stack_mode == 'column': + joint_maps = [] + for c in range(len(part_orders) + 1): + joint_maps.append(map_cube[..., c]) + joint_map = np.column_stack(joint_maps) + + return joint_map, roi_box + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + tl = line_thickness or round( + 0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, (c1[0], c1[1] - 2), + 0, + tl / 3, [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA) + + +def draw_line(im, points, color, stroke_size=2, closed=False): + points = points.astype(np.int32) + for i in range(len(points) - 1): + cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color, + stroke_size) + if closed: + cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size) + + +def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2): + left = bbox[0] + top = bbox[1] + + right = bbox[2] + bottom = bbox[3] + + roi_width = right - left + roi_height = bottom - top + + new_left = left - int(roi_width * enlarge_ratio) + new_left = 0 if new_left < 0 else new_left + + new_top = top - int(roi_height * enlarge_ratio) + new_top = 0 if new_top < 0 else new_top + + new_right = right + int(roi_width * enlarge_ratio) + new_right = img_width if new_right > img_width else new_right + + new_bottom = bottom + int(roi_height * enlarge_ratio) + new_bottom = img_height if new_bottom > img_height else new_bottom + + bbox = [new_left, new_top, new_right, new_bottom] + + bbox = [int(x) for x in bbox] + + return bbox + + +def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')): + map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list] + map_concat = torch.stack(tuple(map_list_cuda), dim=-1) + + map_concat = torch.abs(map_concat) + + map_concat[map_concat < threshold] = 0 + map_concat[map_concat > 1e-5] = 1.0 + + sum_map = torch.sum(map_concat, dim=2) + a = torch.ones_like(sum_map) + acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map)) + + fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map) + + fusion_map = fusion_map.float() + acc_map = acc_map.float() + + fusion_map = fusion_map.cpu().numpy().astype(np.float32) + acc_map = acc_map.cpu().numpy().astype(np.float32) + + return fusion_map, acc_map + + +def gen_border_shade(height, width, height_band, width_band): + height_ratio = height_band * 1.0 / height + width_ratio = width_band * 1.0 / width + + _height_band = int(256 * height_ratio) + _width_band = int(256 * width_ratio) + + canvas = np.zeros((256, 256), dtype=np.float32) + + canvas[_height_band // 2:-_height_band // 2, + _width_band // 
2:-_width_band // 2] = 1.0 + + canvas = cv2.blur(canvas, (_height_band, _width_band)) + + canvas = cv2.resize(canvas, (width, height)) + + return canvas + + +def get_mask_bbox(mask, threshold=127): + ret, mask = cv2.threshold(mask, threshold, 1, 0) + + if cv2.countNonZero(mask) == 0: + return [None, None, None, None] + + col_acc = np.sum(mask, 0) + row_acc = np.sum(mask, 1) + + col_acc = col_acc.tolist() + row_acc = row_acc.tolist() + + for x in range(len(col_acc)): + if col_acc[x] > 0: + left = x + break + + for x in range(1, len(col_acc)): + if col_acc[-x] > 0: + right = len(col_acc) - x + break + + for x in range(len(row_acc)): + if row_acc[x] > 0: + top = x + break + + for x in range(1, len(row_acc)): + if row_acc[-x] > 0: + bottom = len(row_acc[::-1]) - x + break + return [top, bottom, left, right] + + +def visualize_flow(flow): + h, w = flow.shape[:2] + hsv = np.zeros((h, w, 3), np.uint8) + mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1]) + + hsv[..., 0] = ang * 180 / np.pi / 2 + hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX) + hsv[..., 2] = 255 + bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + bgr = bgr * 1.0 / 255 + return bgr.astype(np.float32) + + +def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1): + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + abandon_idxs = [0, 1, 14, 15, 16, 17] + # draw joints + for i, joint in enumerate(joints): + if i in abandon_idxs: + continue + if joint[-1] > confidence_threshold: + + cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2) + if show_text: + cv2.putText(image, + str(i) + '[{:.2f}]'.format(joint[-1]), + (int(joint[0]), int(joint[1])), + cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) + # draw link + for pair in part_orders: + if joints[pair[0]][-1] > confidence_threshold and joints[ + pair[1]][-1] > confidence_threshold: + cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])), + (int(joints[pair[1]][0]), int(joints[pair[1]][1])), color, + 2) + return image + + +def get_heatmap_cv(img, magn, max_flow_mag): + min_flow_mag = .5 + cv_magn = np.clip( + 255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7), + a_min=0, + a_max=255).astype(np.uint8) + if img.dtype != np.uint8: + img = (255 * img).astype(np.uint8) + + heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET) + heatmap_img = heatmap_img[..., ::-1] + + h, w = magn.shape + img_alpha = np.ones((h, w), dtype=np.double)[:, :, None] + heatmap_alpha = np.clip( + magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7 + heatmap_alpha[heatmap_alpha < .2]**.5 + pm_hm = heatmap_img * heatmap_alpha + pm_img = img * img_alpha + cv_out = pm_hm + pm_img * (1 - heatmap_alpha) + cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8) + + return cv_out + + +def save_heatmap_cv(img, flow, supression=2): + + flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) + flow_magn -= supression + flow_magn[flow_magn <= 0] = 0 + cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) + return cv_out + + +@numba.jit(nopython=True, parallel=False) +def bilinear_interp(x, y, v11, v12, v21, v22): + temp1 = (v11 * (1 - y) + v12 * y) * (1 - x) + temp2 = (v21 * (1 - y) + v22 * y) * x + result = temp1 + temp2 + return result + + +@numba.jit(nopython=True, parallel=False) +def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand, + height_expand): + srcW = oriImg.shape[1] + srcH = oriImg.shape[0] + + newImg = oriImg.copy() + + 
for i in range(srcH): + for j in range(srcW): + _i = i + _j = j + + deltaX = rDx[_i, _j] + deltaY = rDy[_i, _j] + + nx = _j + deltaX * transRatio + ny = _i + deltaY * transRatio + + if nx >= srcW - width_expand - 1: + if nx > srcW - 1: + nx = srcW - 1 + + if ny >= srcH - height_expand - 1: + if ny > srcH - 1: + ny = srcH - 1 + + if nx < width_expand: + if nx < 0: + nx = 0 + + if ny < height_expand: + if ny < 0: + ny = 0 + + nxi = int(math.floor(nx)) + nyi = int(math.floor(ny)) + nxi1 = int(math.ceil(nx)) + nyi1 = int(math.ceil(ny)) + + for ll in range(3): + newImg[_i, _j, + ll] = bilinear_interp(ny - nyi, nx - nxi, + oriImg[nyi, nxi, + ll], oriImg[nyi, nxi1, ll], + oriImg[nyi1, nxi, + ll], oriImg[nyi1, nxi1, + ll]) + return newImg diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 717ff4dd..c16e256e 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -184,6 +184,7 @@ TASK_OUTPUTS = { Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG], Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG], + Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG], # live category recognition result for single video # { diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 7fa66b5f..c9a70d14 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,6 +75,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), + Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, + 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), diff --git a/modelscope/pipelines/cv/image_body_reshaping_pipeline.py b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py new file mode 100644 index 00000000..c3600eb5 --- /dev/null +++ b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_body_reshaping, module_name=Pipelines.image_body_reshaping) +class ImageBodyReshapingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image body reshaping pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + logger.info('body reshaping model init done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + output = self.model.inference(input['img']) + result = {'outputs': output} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_img = inputs['outputs'] + return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5bc27c03..2331dc85 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -60,7 +60,7 @@ class CVTasks(object): image_to_image_generation = 'image-to-image-generation' image_style_transfer = 'image-style-transfer' image_portrait_stylization = 'image-portrait-stylization' - + image_body_reshaping = 'image-body-reshaping' image_embedding = 'image-embedding' product_retrieval_embedding = 'product-retrieval-embedding' diff --git a/requirements/cv.txt b/requirements/cv.txt index 5a2d7763..f907256d 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -13,6 +13,7 @@ ml_collections mmcls>=0.21.0 mmdet>=2.25.0 networkx>=2.5 +numba onnxruntime>=1.10 pai-easycv>=0.6.3.6 pandas diff --git a/tests/pipelines/test_image_body_reshaping.py b/tests/pipelines/test_image_body_reshaping.py new file mode 100644 index 00000000..e1955e94 --- /dev/null +++ b/tests/pipelines/test_image_body_reshaping.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageBodyReshapingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_body_reshaping + self.model_id = 'damo/cv_flow-based-body-reshaping_damo' + self.test_image = 'data/test/images/image_body_reshaping.jpg' + + def pipeline_inference(self, pipeline: Pipeline, input_location: str): + result = pipeline(input_location) + if result is not None: + cv2.imwrite('result_bodyreshaping.png', + result[OutputKeys.OUTPUT_IMG]) + print( + f'Output written to {osp.abspath("result_body_reshaping.png")}' + ) + else: + raise Exception('Testing failed: invalid output') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id) + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=model_dir) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=self.model_id) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + image_body_reshaping = pipeline(Tasks.image_body_reshaping) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skip('demo compatibility test is only enabled on a 
needed-basis')
+    def test_demo_compatibility(self):
+        self.compatibility_check()
+
+
+if __name__ == '__main__':
+    unittest.main()

From 12ee711f682b5d90d35eb6c7ec024ccf87ee619a Mon Sep 17 00:00:00 2001
From: "wenqi.oywq"
Date: Tue, 11 Oct 2022 11:07:45 +0800
Subject: [PATCH 009/129] [to #42322933]add license header

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10353812

* add license header
---
 modelscope/models/cv/image_color_enhance/csrnet.py | 3 +++
 .../models/cv/image_color_enhance/image_color_enhance.py | 1 +
 2 files changed, 4 insertions(+)

diff --git a/modelscope/models/cv/image_color_enhance/csrnet.py b/modelscope/models/cv/image_color_enhance/csrnet.py
index 782cd528..502abf88 100644
--- a/modelscope/models/cv/image_color_enhance/csrnet.py
+++ b/modelscope/models/cv/image_color_enhance/csrnet.py
@@ -1,3 +1,6 @@
+# The implementation is adopted from Jingwen He,
+# made publicly available at https://github.com/hejingwenhejingwen/CSRNet
+
 import functools
 import math

diff --git a/modelscope/models/cv/image_color_enhance/image_color_enhance.py b/modelscope/models/cv/image_color_enhance/image_color_enhance.py
index 382cc152..0bd74197 100644
--- a/modelscope/models/cv/image_color_enhance/image_color_enhance.py
+++ b/modelscope/models/cv/image_color_enhance/image_color_enhance.py
@@ -1,3 +1,4 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path as osp
 from copy import deepcopy
 from typing import Dict, Union

From 4bd72e528ad9938e131908c5a67920666fcdcae1 Mon Sep 17 00:00:00 2001
From: pangda
Date: Tue, 11 Oct 2022 11:14:34 +0800
Subject: [PATCH 010/129] [to #42322933] support restore best checkpoint after training
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Support automatically restoring the best ckpt after training finishes, which makes it convenient to evaluate on different test sets.
2. Change build_optimizer/build_lr_scheduler into member functions so they can be overridden (e.g. for layer-wise model learning rates).

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10348255
---
 modelscope/trainers/hooks/checkpoint_hook.py | 7 +++++++
 modelscope/trainers/trainer.py | 10 ++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py
index 220929b8..c9f51a88 100644
--- a/modelscope/trainers/hooks/checkpoint_hook.py
+++ b/modelscope/trainers/hooks/checkpoint_hook.py
@@ -216,6 +216,7 @@ class BestCkptSaverHook(CheckpointHook):
         by_epoch (bool): Save best checkpoints by epoch or by iteration.
         save_optimizer (bool): Whether to save optimizer state dict. Default: True.
         save_dir (str): Output directory to save best checkpoint.
+        restore_best (bool): Whether to restore the best checkpoint after training.
     """

     PRIORITY = Priority.LOW
@@ -228,6 +229,7 @@ class BestCkptSaverHook(CheckpointHook):
                  save_optimizer=True,
                  save_dir=None,
                  save_file_name=None,
+                 restore_best=False,
                  interval=0):
         assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.'
         super().__init__(
@@ -241,6 +243,7 @@ class BestCkptSaverHook(CheckpointHook):
         self._best_metric = None
         self._best_ckpt_file = None
         self.save_file_name = save_file_name
+        self.restore_best = restore_best

     def _should_save(self, trainer):
         return self._is_best_metric(trainer.metric_values)
@@ -305,3 +308,7 @@ class BestCkptSaverHook(CheckpointHook):
             self.logger.warn(
                 'The state_dict is not available, the best metric value will be affected.'
) + + def after_run(self, trainer): + if self.restore_best: + self.load_checkpoint(self._best_ckpt_file, trainer) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a01d9b59..4c21d63f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -664,6 +664,12 @@ class EpochBasedTrainer(BaseTrainer): dataset = self.to_task_dataset(torch_dataset, mode) return dataset + def build_optimizer(self, cfg: ConfigDict, default_args: dict = None): + return build_optimizer(self.model, cfg=cfg, default_args=default_args) + + def build_lr_scheduler(self, cfg: ConfigDict, default_args: dict = None): + return build_lr_scheduler(cfg=cfg, default_args=default_args) + def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler @@ -680,7 +686,7 @@ class EpochBasedTrainer(BaseTrainer): optim_options = {} if optimizer_cfg is not None: optim_options = optimizer_cfg.pop('options', {}) - optimizer = build_optimizer(self.model, cfg=optimizer_cfg) + optimizer = self.build_optimizer(cfg=optimizer_cfg) if lr_scheduler is None: lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) @@ -691,7 +697,7 @@ class EpochBasedTrainer(BaseTrainer): if lr_scheduler_cfg is not None: assert optimizer is not None lr_options = lr_scheduler_cfg.pop('options', {}) - lr_scheduler = build_lr_scheduler( + lr_scheduler = self.build_lr_scheduler( cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) self.optimizer = optimizer From 333c11c0a61c780a524d1b3b07793cff0d46a8da Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 14:06:07 +0800 Subject: [PATCH 011/129] [to #42322933] fix: missing type bytes in InputType.AUDIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 已经和谦言讨论过,确认可添加 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10358110 * fix: missing type bytes in InputType.AUDIO --- modelscope/pipeline_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index de9814a7..2b14c278 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -28,7 +28,7 @@ class InputType(object): INPUT_TYPE = { InputType.IMAGE: (str, np.ndarray, Image.Image), InputType.TEXT: str, - InputType.AUDIO: (str, np.ndarray), + InputType.AUDIO: (str, bytes, np.ndarray), InputType.VIDEO: (str, np.ndarray, cv2.VideoCapture), InputType.BOX: (list, np.ndarray), InputType.DICT: (dict, type(None)), From 09d2296f36f1a301dc5e144e00a692dfce2675ee Mon Sep 17 00:00:00 2001 From: "laiyin.lyc" Date: Tue, 11 Oct 2022 16:05:20 +0800 Subject: [PATCH 012/129] [to #44847108] add sparsity hook (pst algorithm) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10198228 * [to #44847108] add sparsity hook (pst algorithm) --- modelscope/metainfo.py | 3 + modelscope/trainers/hooks/__init__.py | 4 +- .../trainers/hooks/compression/__init__.py | 24 ++ .../hooks/compression/sparsity_hook.py | 131 +++++++++++ .../trainers/hooks/compression/utils.py | 208 ++++++++++++++++++ tests/trainers/hooks/compression/__init__.py | 0 .../hooks/compression/test_sparsity_hook.py | 113 ++++++++++ 7 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 modelscope/trainers/hooks/compression/__init__.py create mode 100644 modelscope/trainers/hooks/compression/sparsity_hook.py create mode 100644 modelscope/trainers/hooks/compression/utils.py create mode 100644 tests/trainers/hooks/compression/__init__.py create mode 100644 
tests/trainers/hooks/compression/test_sparsity_hook.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 1b8c4720..77627abc 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -404,6 +404,9 @@ class Hooks(object): IterTimerHook = 'IterTimerHook' EvaluationHook = 'EvaluationHook' + # Compression + SparsityHook = 'SparsityHook' + class LR_Schedulers(object): """learning rate scheduler is defined here diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py index f133041b..a2e0cf4b 100644 --- a/modelscope/trainers/hooks/__init__.py +++ b/modelscope/trainers/hooks/__init__.py @@ -6,10 +6,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .builder import HOOKS, build_hook from .checkpoint_hook import BestCkptSaverHook, CheckpointHook + from .compression import SparsityHook from .evaluation_hook import EvaluationHook from .hook import Hook from .iter_timer_hook import IterTimerHook - from .logger import TextLoggerHook, TensorboardHook + from .logger import TensorboardHook, TextLoggerHook from .lr_scheduler_hook import LrSchedulerHook from .optimizer import (ApexAMPOptimizerHook, NoneOptimizerHook, OptimizerHook, TorchAMPOptimizerHook) @@ -19,6 +20,7 @@ else: _import_structure = { 'builder': ['HOOKS', 'build_hook'], 'checkpoint_hook': ['BestCkptSaverHook', 'CheckpointHook'], + 'compression': ['SparsityHook'], 'evaluation_hook': ['EvaluationHook'], 'hook': ['Hook'], 'iter_timer_hook': ['IterTimerHook'], diff --git a/modelscope/trainers/hooks/compression/__init__.py b/modelscope/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..f755b2ca --- /dev/null +++ b/modelscope/trainers/hooks/compression/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .sparsity_hook import SparsityHook + from .utils import SparseLinear, convert_sparse_network + +else: + _import_structure = { + 'sparsity_hook': ['SparsityHook'], + 'utils': ['convert_sparse_network', 'SparseLinear'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/hooks/compression/sparsity_hook.py b/modelscope/trainers/hooks/compression/sparsity_hook.py new file mode 100644 index 00000000..993488d8 --- /dev/null +++ b/modelscope/trainers/hooks/compression/sparsity_hook.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
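The SparsityHook implemented in this new file is enabled purely through the trainer configuration. A minimal sketch of the corresponding `train.hooks` entry follows; the field values are illustrative placeholders rather than tuned defaults, and only keys read by the hook's constructor are used.

    # Illustrative `train.hooks` entry for SparsityHook; values are placeholders.
    sparsity_hook_cfg = {
        'type': 'SparsityHook',
        'pruning_method': 'pst',    # the only method given special handling in this patch
        'config': {
            'final_sparsity': 0.9,  # target fraction of zeroed weights
            'frequency': 1,         # how often the sparsity ratio is re-scheduled
            'initial_warmup': 0.1,  # fraction of steps held at initial_sparsity
            'final_warmup': 0.3,    # fraction of steps held at final_sparsity
        },
    }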
+import os + +from modelscope import __version__ +from modelscope.metainfo import Hooks +from modelscope.trainers.hooks.builder import HOOKS +from modelscope.trainers.hooks.hook import Hook +from modelscope.trainers.hooks.priority import Priority +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.torch_utils import is_master + + +@HOOKS.register_module(module_name=Hooks.SparsityHook) +class SparsityHook(Hook): + + PRIORITY = Priority.HIGHEST + + def __init__(self, pruning_method, config={}, save_dir=None): + self.pruning_method = pruning_method + self.save_dir = save_dir + + self.compress_module = config.get('compress_module', []) + self.weight_rank = config.get('weight_rank', 8) + self.weight_beta = config.get('weight_beta', 1) + self.mask_rank = config.get('mask_rank', 8) + self.mask_alpha1 = config.get('mask_alpha1', 1) + self.mask_alpha2 = config.get('mask_alpha2', 1) + + self.step = 0 + self.total_step = 0 + self.frequency = config.get('frequency', 1) + self.initial_warmup = config.get('initial_warmup', 0.1) + self.final_warmup = config.get('final_warmup', 0.3) + self.initial_sparsity = config.get('initial_sparsity', 0.0) + self.final_sparsity = config.get('final_sparsity', 0.0) + + def before_run(self, trainer): + import torch + + from .utils import SparseLinear, convert_sparse_network + + if self.save_dir is None: + self.save_dir = trainer.work_dir + + if len(self.compress_module) == 0: + convert_sparse_network( + trainer.model, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + else: + for cm in self.compress_module: + for name, module in trainer.model.named_modules(): + if name != cm: + continue + convert_sparse_network( + module, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + + for i in range(len(trainer.optimizer.param_groups)): + new_train_params = [] + for param in trainer.optimizer.param_groups[i]['params']: + is_find = False + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + if torch.equal(param.half(), + module.weight.data.half()): + is_find = True + break + + if not is_find: + new_train_params.append(param) + + trainer.optimizer.param_groups[i]['params'] = new_train_params + + new_params = [] + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + new_params.extend( + [p for p in module.parameters() if p.requires_grad]) + + trainer.optimizer.add_param_group({'params': new_params}) + + self.total_step = trainer.iters_per_epoch * trainer._max_epochs + + def before_train_iter(self, trainer): + from .utils import schedule_sparsity_ratio, update_network_sparsity + + cur_sparsity = schedule_sparsity_ratio( + self.step, + self.total_step, + self.frequency, + self.initial_warmup, + self.final_warmup, + self.initial_sparsity, + self.final_sparsity, + ) + + update_network_sparsity(trainer.model, cur_sparsity) + + if is_master(): + trainer.logger.info( + f'Step[{self.step}/{self.total_step}] current sparsity ratio = {cur_sparsity}' + ) + + self.step += 1 + + def after_run(self, trainer): + from .utils import generate_sparse_model + + generate_sparse_model(trainer.model, logger=trainer.logger) + + self._save_checkpoint(trainer) + + 
def _save_checkpoint(self, trainer): + if is_master(): + trainer.logger.info('Saving checkpoint at final compress') + cur_save_name = os.path.join(self.save_dir, 'compress_model.pth') + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) diff --git a/modelscope/trainers/hooks/compression/utils.py b/modelscope/trainers/hooks/compression/utils.py new file mode 100644 index 00000000..59418201 --- /dev/null +++ b/modelscope/trainers/hooks/compression/utils.py @@ -0,0 +1,208 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn + +from modelscope.utils.torch_utils import is_master + + +class SparseBinarizer(torch.autograd.Function): + + @staticmethod + def forward(ctx, mask_scores, sparsity): + num_prune = int(mask_scores.numel() * sparsity) + prune_indices = torch.argsort(mask_scores.reshape(-1))[:num_prune] + mask = mask_scores.clone().fill_(1) + mask.reshape(-1)[prune_indices] = 0.0 + return mask + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None + + +class SparseLinear(nn.Module): + """ + Fully Connected layer with on the fly adaptive mask. + """ + + def __init__( + self, + module, + pruning_method='pst', + weight_rank=8, + weight_beta=1.0, + mask_rank=8, + mask_alpha1=1.0, + mask_alpha2=1.0, + ): + super(SparseLinear, self).__init__() + self.module = module + out_features = self.module.weight.shape[0] + in_features = self.module.weight.shape[1] + + self.weight = self.module.weight + self.module.weight = None + self.module._parameters.pop('weight') + + self.pruning_method = pruning_method + + self.cur_sparsity = 0.0 + + if self.pruning_method == 'pst': + self.weight_rank = weight_rank + self.weight_beta = weight_beta + self.mask_rank = mask_rank + self.mask_alpha1 = mask_alpha1 + self.mask_alpha2 = mask_alpha2 + + # create trainable params + self.weight_U = nn.Parameter( + torch.randn(out_features, self.weight_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.weight_V = nn.Parameter( + torch.zeros(self.weight_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.mask_scores_A = nn.Parameter( + torch.randn(out_features, self.mask_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_B = nn.Parameter( + torch.zeros(self.mask_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_R = nn.Parameter( + torch.zeros(out_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_C = nn.Parameter( + torch.zeros(in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.weight.requires_grad = False + if self.module.bias is not None: + self.module.bias.requires_grad = False + + def forward(self, *inputs): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * (self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + masked_weight = mask * weight + + self.module.weight = masked_weight + return self.module(*inputs) + else: + return self.module(*inputs) + + def convert(self): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * 
(self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + + masked_weight = mask * weight + self.module.weight = nn.Parameter(masked_weight.data) + + +def _setattr(model, name, module): + name_list = name.split('.') + for name in name_list[:-1]: + model = getattr(model, name) + setattr(model, name_list[-1], module) + + +def convert_sparse_network( + model, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + logger=None, +): + compress_module = [nn.Linear] + try: + from megatron import mpu + compress_module.extend( + [mpu.RowParallelLinear, mpu.ColumnParallelLinear]) + except ImportError: + pass + + for name, module in model.named_modules(): + if type(module) in compress_module: + new_module = SparseLinear( + module, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + ) + + # replace original module by new sparse module + _setattr(model, name, new_module) + + if is_master(): + if logger: + logger.info(f'convert {name} to sparse module.') + else: + print(f'convert {name} to sparse module.') + + +def update_network_sparsity(model, sparsity): + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.cur_sparsity = sparsity + + +def schedule_sparsity_ratio( + step, + total_step, + frequency, + initial_warmup, + final_warmup, + initial_sparsity, + final_sparsity, +): + if step <= initial_warmup * total_step: + sparsity = initial_sparsity + elif step > (total_step - final_warmup * total_step): + sparsity = final_sparsity + else: + spars_warmup_steps = initial_warmup * total_step + spars_schedu_steps = (final_warmup + initial_warmup) * total_step + step = (step - spars_warmup_steps) // frequency * frequency + mul_coeff = 1 - step / (total_step - spars_schedu_steps) + sparsity = final_sparsity + (initial_sparsity - final_sparsity) * ( + mul_coeff**3) + return sparsity + + +def generate_sparse_model(model, logger=None): + # generate sparse weight for saving + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.convert() + + _setattr(model, name, module.module) + + if is_master(): + if logger: + logger.info(f'convert {name} weight to sparse weight, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) + else: + print(f'convert {name} weight to sparse, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) diff --git a/tests/trainers/hooks/compression/__init__.py b/tests/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/hooks/compression/test_sparsity_hook.py b/tests/trainers/hooks/compression/test_sparsity_hook.py new file mode 100644 index 00000000..4af4dcdb --- /dev/null +++ b/tests/trainers/hooks/compression/test_sparsity_hook.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
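The top-k binarization that `SparseBinarizer` applies in the hunk above reduces to a few stand-alone lines; the scores below are random and purely illustrative.

    import torch

    scores = torch.randn(4, 4)          # one importance score per weight (illustrative)
    sparsity = 0.5                      # fraction of weights to prune
    num_prune = int(scores.numel() * sparsity)
    prune_idx = torch.argsort(scores.reshape(-1))[:num_prune]
    mask = torch.ones_like(scores)
    mask.reshape(-1)[prune_idx] = 0.0   # the lowest-scoring entries are zeroed
    assert torch.mean(1.0 * (mask == 0)) == sparsity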
+import os +import shutil +import tempfile +import unittest + +import json +import numpy as np +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, TrainerStages +from modelscope.utils.test_utils import create_dummy_test_dataset + +dummy_dataset = create_dummy_test_dataset( + np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10) + + +class DummyModel(nn.Module, Model): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 10) + self.bn = nn.BatchNorm1d(10) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class SparsityHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_sparsity_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'SparsityHook', + 'pruning_method': 'pst', + 'config': { + 'weight_rank': 1, + 'mask_rank': 1, + 'final_sparsity': 0.9, + 'frequency': 1, + }, + }], + }, + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) + trainer_name = Trainers.default + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=dummy_dataset, + optimizers=(optimizer, lr_scheduler), + max_epochs=5, + device='cpu', + ) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + trainer.register_hook_from_cfg(trainer.cfg.train.hooks) + trainer.train_dataloader = train_dataloader + trainer.data_loader = train_dataloader + trainer.invoke_hook(TrainerStages.before_run) + for i in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer.invoke_hook(TrainerStages.after_run) + + self.assertEqual( + torch.mean(1.0 * (trainer.model.linear.weight == 0)), 0.9) + + +if __name__ == '__main__': + unittest.main() From 67d6fa001da5cb58b81dcb68968355995f8e586f Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Tue, 11 Oct 2022 17:17:51 +0800 Subject: [PATCH 013/129] [to #42322933] unify keys forbody_3d_keypoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 统一关键点检测输出key的名字 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10359335 --- modelscope/outputs.py | 4 ++-- modelscope/pipelines/cv/body_3d_keypoints_pipeline.py | 4 ++-- tests/pipelines/test_body_3d_keypoints.py | 2 +- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c16e256e..331f4816 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -222,7 +222,7 @@ TASK_OUTPUTS = { # 3D human body keypoints detection result for single sample # { - # "poses": [ # 3d pose coordinate in camera coordinate + # "keypoints": [ # 3d pose coordinate in camera coordinate # [[x, y, z]*17], # joints of per image # [[x, y, z]*17], # ... @@ -236,7 +236,7 @@ TASK_OUTPUTS = { # and is only avaialbe when the "render" option is enabled. # } Tasks.body_3d_keypoints: - [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], + [OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], # 2D hand keypoints result for single sample # { diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index b0faa1e0..c3f4e8c1 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -180,7 +180,7 @@ class Body3DKeypointsPipeline(Pipeline): return res def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: - res = {OutputKeys.POSES: [], OutputKeys.TIMESTAMPS: []} + res = {OutputKeys.KEYPOINTS: [], OutputKeys.TIMESTAMPS: []} if not input['success']: pass @@ -197,7 +197,7 @@ class Body3DKeypointsPipeline(Pipeline): self.render_prediction(pred_3d_pose, output_video_path) res[OutputKeys.OUTPUT_VIDEO] = output_video_path - res[OutputKeys.POSES] = pred_3d_pose + res[OutputKeys.KEYPOINTS] = pred_3d_pose res[OutputKeys.TIMESTAMPS] = self.timestamps return res diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index 6f27f12d..6e671d2e 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -21,7 +21,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, pipeline_input): output = pipeline(pipeline_input, output_video='./result.mp4') - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) print(f'result 3d points shape {poses.shape}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From e240edea7ebbd7beb66246fb18b071a6ba0a65c0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 11 Oct 2022 17:20:11 +0800 Subject: [PATCH 014/129] [to #42322933]t5 bug fixex --- modelscope/preprocessors/nlp/nlp_base.py | 4 +--- tests/pipelines/test_text2text_generation.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6b559de9..a9be0cb0 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -417,14 +417,12 @@ class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') kwargs['padding'] = kwargs.get('padding', False) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: 
text_a, _, _ = self.parse_text_and_label(data) diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index a39562f5..2506547e 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -18,7 +18,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/t5-cn-base-test' self.input = '中国的首都位于。' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_T5(self): cache_path = snapshot_download(self.model_id) model = T5ForConditionalGeneration(cache_path) @@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): preprocessor=preprocessor) print(pipeline_ins(self.input)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( task=Tasks.text2text_generation, model=self.model_id) From 65be443e982755a96915b363af6b6ad2dfe5c827 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 11 Oct 2022 17:22:58 +0800 Subject: [PATCH 015/129] [to #41669377] Add more models to test in tts UT Link https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754#tab=detail Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754 --- .../audio/tts/models/datasets/__init__.py | 0 tests/pipelines/test_text_to_speech.py | 70 +++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index f659e59b..0caf1c84 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,21 +27,67 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k' + zhcn_text = '今天北京天气怎么样' + en_text = 'How is the weather in Beijing?' 
+ zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] + enus_voice = ['andy', 'annie'] + engb_voice = ['luca', 'luna'] + self.tts_test_cases = [] + for voice in zhcn_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'zh-cn') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': zhcn_text + }) + for voice in enus_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-us') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + for voice in engb_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-gb') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' + engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' + self.tts_test_cases.append({ + 'voice': 'zhcn', + 'model_id': zhcn_model_id, + 'text': zhcn_text + }) + self.tts_test_cases.append({ + 'voice': 'enus', + 'model_id': enus_model_id, + 'text': en_text + }) + self.tts_test_cases.append({ + 'voice': 'engb', + 'model_id': engb_model_id, + 'text': en_text + }) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - text = '今天北京天气怎么样?' - voice = 'zhitian_emo' - - model = Model.from_pretrained( - model_name_or_path=self.model_id, revision='pytorch_am') - sambert_hifigan_tts = pipeline(task=self.task, model=model) - self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=text, voice=voice) - self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) - pcm = output[OutputKeys.OUTPUT_PCM] - write('output.wav', 16000, pcm) + for case in self.tts_test_cases: + logger.info('test %s' % case['voice']) + model = Model.from_pretrained( + model_name_or_path=case['model_id'], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=case['text']) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % case['voice'], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 4c993638046a051a624375241ae5271509ebb510 Mon Sep 17 00:00:00 2001 From: "james.wjg" Date: Tue, 11 Oct 2022 17:24:46 +0800 Subject: [PATCH 016/129] =?UTF-8?q?[to=20#42322933]video=20summarization?= =?UTF-8?q?=20=E6=B7=BB=E5=8A=A0=20license=20&=20header;=20=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=20output=20for=20demo=20service?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit video summarization: 1. 添加 license & header; 2. 
修改 output for demo service Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260946 --- .../metrics/video_summarization_metric.py | 3 ++ .../models/cv/video_summarization/__init__.py | 23 +++++++++- .../cv/video_summarization/base_model.py | 3 +- .../cv/video_summarization/kts/cpd_auto.py | 3 +- .../cv/video_summarization/kts/cpd_nonlin.py | 3 +- .../models/cv/video_summarization/pgl_sum.py | 3 +- .../cv/video_summarization/summarizer.py | 46 ++++++++++++++++++- .../video_summarization_dataset.py | 5 +- modelscope/outputs.py | 16 +++++++ .../cv/video_summarization_pipeline.py | 13 ++++-- tests/pipelines/test_video_summarization.py | 3 -- 11 files changed, 107 insertions(+), 14 deletions(-) diff --git a/modelscope/metrics/video_summarization_metric.py b/modelscope/metrics/video_summarization_metric.py index d1867600..40580382 100644 --- a/modelscope/metrics/video_summarization_metric.py +++ b/modelscope/metrics/video_summarization_metric.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + from typing import Dict import numpy as np diff --git a/modelscope/models/cv/video_summarization/__init__.py b/modelscope/models/cv/video_summarization/__init__.py index 064110f7..15ad61b4 100644 --- a/modelscope/models/cv/video_summarization/__init__.py +++ b/modelscope/models/cv/video_summarization/__init__.py @@ -1 +1,22 @@ -from .summarizer import PGLVideoSummarization +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .summarizer import (PGLVideoSummarization, summary_format) + +else: + _import_structure = { + 'summarizer': ['PGLVideoSummarization', 'summary_format'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_summarization/base_model.py b/modelscope/models/cv/video_summarization/base_model.py index 670da251..912ba68d 100644 --- a/modelscope/models/cv/video_summarization/base_model.py +++ b/modelscope/models/cv/video_summarization/base_model.py @@ -1,4 +1,5 @@ -# The implementation is based on pytorch-caffe-models, available at https://github.com/crowsonkb/pytorch-caffe-models. +# Part of the implementation is borrowed and modified from pytorch-caffe-models, +# publicly available at https://github.com/crowsonkb/pytorch-caffe-models import cv2 import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_auto.py b/modelscope/models/cv/video_summarization/kts/cpd_auto.py index a794ca26..58281df8 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_auto.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_auto.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. +# Part of the implementation is borrowed and modified from KTS, +# publicly available at https://github.com/TatsuyaShirakawa/KTS import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py index ef2eb6ef..55e279e9 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. 
+# Part of the implementation is borrowed and modified from KTS,
+# publicly available at https://github.com/TatsuyaShirakawa/KTS

 import numpy as np
diff --git a/modelscope/models/cv/video_summarization/pgl_sum.py b/modelscope/models/cv/video_summarization/pgl_sum.py
index ab3010c9..2d27501d 100644
--- a/modelscope/models/cv/video_summarization/pgl_sum.py
+++ b/modelscope/models/cv/video_summarization/pgl_sum.py
@@ -1,4 +1,5 @@
-# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM.
+# Part of the implementation is borrowed and modified from PGL-SUM,
+# publicly available at https://github.com/e-apostolidis/PGL-SUM

 import math
diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py
index c95da025..75251989 100644
--- a/modelscope/models/cv/video_summarization/summarizer.py
+++ b/modelscope/models/cv/video_summarization/summarizer.py
@@ -1,4 +1,5 @@
-# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM.
+# Part of the implementation is borrowed and modified from PGL-SUM,
+# publicly available at https://github.com/e-apostolidis/PGL-SUM

 import os.path as osp
 from copy import deepcopy
@@ -23,7 +24,8 @@ logger = get_logger()
 def get_change_points(video_feat, n_frame):
     video_feat = np.array(video_feat, np.float32)
     K = np.dot(video_feat, video_feat.T)
-    change_points, _ = cpd_auto(K, ncp=120, vmax=2.2 / 4.0, lmin=1)
+    change_points, _ = cpd_auto(
+        K, ncp=min(K.shape[0] - 1, 120), vmax=2.2 / 4.0, lmin=1)

     change_points = change_points * 15
     change_points = np.concatenate(([0], change_points, [n_frame - 1]))
@@ -135,6 +137,46 @@ def generate_summary(all_shot_bound, all_scores, all_nframes, all_positions):
     return all_summaries


+def transform_time(seconds):
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    time = '%02d:%02d:%06.3f' % (h, m, s)
+    return time
+
+
+def summary_format(summary, fps):
+    frames_list = []
+    start_frame = -1
+    end_frame = -1
+    is_summary_frame = False
+    for i, idx in enumerate(summary):
+        if idx:
+            if is_summary_frame is False:
+                start_frame = i
+                is_summary_frame = True
+        else:
+            if is_summary_frame:
+                end_frame = i - 1
+                frames_list.append([start_frame, end_frame])
+                is_summary_frame = False
+
+    if is_summary_frame and summary[-1] == 1:
+        end_frame = len(summary) - 1
+        frames_list.append([start_frame, end_frame])
+
+    output = []
+    for seg in frames_list:
+        output.append({
+            'frame':
+            seg,
+            'timestamps': [
+                transform_time(seg[0] / float(fps)),
+                transform_time(seg[1] / float(fps))
+            ]
+        })
+    return output
+
+
 @MODELS.register_module(
     Tasks.video_summarization, module_name=Models.video_summarization)
 class PGLVideoSummarization(TorchModel):
diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py
index 89deb7ba..34eb0450 100644
--- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py
+++ b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py
@@ -1,3 +1,6 @@
+# Part of the implementation is borrowed and modified from PGL-SUM,
+# publicly available at https://github.com/e-apostolidis/PGL-SUM
+
 import os

 import h5py
@@ -15,7 +18,7 @@ class VideoSummarizationDataset(TorchTaskDataset):
         self.mode = mode
         self.data_filename = os.path.join(root_dir, opt.dataset_file)
         self.split_filename = os.path.join(root_dir, opt.split_file)
-        self.split_index = opt.split_index  # it represents the current split 
(varies from 0 to 4) + self.split_index = opt.split_index hdf = h5py.File(self.data_filename, 'r') self.list_frame_features, self.list_gtscores = [], [] self.list_user_summary = [] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 331f4816..07a14191 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -337,6 +337,22 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # video summarization result for a single video + # { + # "output": + # [ + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # }, + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # } + # ] + # } + Tasks.video_summarization: [OutputKeys.OUTPUT], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py index 25ea1e7c..e4fe206d 100644 --- a/modelscope/pipelines/cv/video_summarization_pipeline.py +++ b/modelscope/pipelines/cv/video_summarization_pipeline.py @@ -1,4 +1,6 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + import os.path as osp from typing import Any, Dict @@ -8,7 +10,8 @@ import torch from tqdm import tqdm from modelscope.metainfo import Pipelines -from modelscope.models.cv.video_summarization import PGLVideoSummarization +from modelscope.models.cv.video_summarization import (PGLVideoSummarization, + summary_format) from modelscope.models.cv.video_summarization.base_model import bvlc_googlenet from modelscope.models.cv.video_summarization.summarizer import ( generate_summary, get_change_points) @@ -57,6 +60,8 @@ class VideoSummarizationPipeline(Pipeline): frames = [] picks = [] cap = cv2.VideoCapture(input) + self.fps = cap.get(cv2.CAP_PROP_FPS) + self.frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) frame_idx = 0 while (cap.isOpened()): ret, frame = cap.read() @@ -89,7 +94,9 @@ class VideoSummarizationPipeline(Pipeline): summary = self.inference(frame_features, input['n_frame'], input['picks'], change_points) - return {OutputKeys.OUTPUT: summary} + output = summary_format(summary, self.fps) + + return {OutputKeys.OUTPUT: output} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py index 6dcc31e9..1f965c53 100644 --- a/tests/pipelines/test_video_summarization.py +++ b/tests/pipelines/test_video_summarization.py @@ -3,7 +3,6 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.cv.image_utils import show_video_summarization_result from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -22,8 +21,6 @@ class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): result = summarization_pipeline(video_path) print(f'video summarization output: \n{result}.') - show_video_summarization_result(video_path, result, - './summarization_result.avi') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_modelhub_default_model(self): From 02c913a0fee0bbb0a6ade9086c9c142f508ab3e0 Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Tue, 11 Oct 2022 17:26:43 +0800 Subject: [PATCH 017/129] [to #42322933] add plug doc string Link: 
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10337105 --- .../models/nlp/plug/configuration_plug.py | 165 +++++++----- .../models/nlp/plug/distributed_plug.py | 44 +++- modelscope/models/nlp/plug/modeling_plug.py | 243 ++++++++---------- 3 files changed, 240 insertions(+), 212 deletions(-) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py index 64807392..c3a526a9 100644 --- a/modelscope/models/nlp/plug/configuration_plug.py +++ b/modelscope/models/nlp/plug/configuration_plug.py @@ -40,8 +40,6 @@ class PlugNLUConfig(PretrainedConfig): max_position_embeddings=2048, type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, @@ -53,20 +51,7 @@ class PlugNLUConfig(PretrainedConfig): fp32_tokentypes=False, layernorm_epsilon=1e-5, dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -82,8 +67,6 @@ class PlugNLUConfig(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -95,20 +78,7 @@ class PlugNLUConfig(PretrainedConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank @classmethod def from_dict(cls, json_object): @@ -148,47 +118,115 @@ class PlugNLUConfig(PretrainedConfig): class PlugNLGConfig(PlugNLUConfig): + """ + This is the configuration class to store the configuration of a [`PlugModel`]. It is used to instantiate a + PLUG understanding model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PLUG + [PLUG](https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary) architecture. + + Configuration objects inherit from [`PlugNLUConfig`] and can be used to control the model outputs. Read the + documentation from [`PlugNLUConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 21504): + Padded vocabulary size of the PLUG model for vocab tensor parallel. Defines the number of different tokens + that can be represented by the `inputs_ids` passed when calling [`PlugModel`]. 
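Editorial note: the distinction drawn here between the padded `vocab_size` (21504) and the true vocabulary size (`original_vocab_size`, 21128) follows the usual tensor-parallel practice of rounding the embedding table up so it splits evenly across partitions. The exact alignment rule is not stated in the patch; the arithmetic below is a sketch assuming 128-alignment per each of the 8 model-parallel partitions, which happens to reproduce the documented default:

```python
# Assumed padding rule (not spelled out in the patch): round the true vocab up
# to a multiple of 128 * model_parallel_size so every partition gets an equal,
# 128-aligned slice of the embedding table.
import math

original_vocab_size = 21128      # true vocabulary size
alignment = 128 * 8              # assumption: 128 per partition, 8 partitions
padded = math.ceil(original_vocab_size / alignment) * alignment
print(padded)                    # 21504, matching the vocab_size default above
```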
+ original_vocab_size (`int`, *optional*, defaults to 21128): + True vocabulary size of the PLUG model. Defines the number of different tokens that can be represented. + hidden_size (`int`, *optional*, defaults to 8192): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + dec_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 128): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 32768): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the Transformer Attention. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 3): + The vocabulary size of the `token_type_ids` passed when calling [`PlugModel`]. + initializer_range (`float`, *optional*, defaults to 0.00707): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + lr_decay_style (`str`, *optional*, defaults to 'linear'): + The decay style of learning rate during fine-tunining. If string, `"linear"`, `"cosine"`, `"exponential"`, + `"constant"`, `"None"` are supported. + weight_decay (`float`, *optional*, defaults to 1e-2): + Decoupled weight decay to apply. + clip_grad (`float`, *optional*, defaults to 1.0): + Maximum gradient norm for gradient clipping. + warmup (`float`, *optional*, defaults to 0.01): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + pre_ln (`boolean`, *optional*, defaults to `True`): + Whether or not to apply LayerNorm to the input instead of the output in the blocks. + fp16 (`boolean`, *optional*, defaults to `True`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp32_layernorm (`boolean`, *optional*, defaults to `True`): + Whether to use fp32 32-bit precision LayerNorm training while the argument `fp16` set to `True`. + fp32_embedding (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision Embedding training while the argument `fp16` set to `True`. + fp32_tokentypes (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision token types training while the argument `fp16` set to `True`. + layernorm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + attn_separate (`boolean`, *optional*, defaults to `False`): + Whether or not to separate query-key-value to query, key, value in the Attention. + + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. 
The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = 'plugNLG' def __init__(self, vocab_size=21504, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, + original_vocab_size=21128, + hidden_size=8192, + num_hidden_layers=24, + dec_hidden_layers=6, + num_attention_heads=128, + intermediate_size=32768, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, + max_position_embeddings=2048, + type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, warmup=0.01, - pre_ln=False, - fp16=False, - fp32_layernorm=False, + pre_ln=True, + fp16=True, + fp32_layernorm=True, fp32_embedding=False, fp32_tokentypes=False, - layernorm_epsilon=1e-12, - dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, + layernorm_epsilon=1e-5, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -203,8 +241,6 @@ class PlugNLGConfig(PlugNLUConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -216,17 +252,4 @@ class PlugNLGConfig(PlugNLUConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 2992f595..06009ba1 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -20,6 +20,48 @@ logger = get_logger(__name__) class DistributedPlug(TorchModel): + """ + The wapper class of PLUG Model to initialize parallel environment, load model weights, generate sentences. + Parameters: + model_dir (`str`, *required*): + Path to model damo/nlp_plug_text-generation_27B. 
+ The model structure in model_dir should be like this: + model_dir + |_ config.json + |_ configuration.json + |_ ds_zero-offload_10B_config.json + |_ vocab.txt + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + rank (`int`, *required*): + Used to identify different GPUs in a tensor parallel environment. eg. The rank of GPU #0 is 0, and the + model file `mp_rank_00_model_states.pt` will be loaded on this GPU. + world_size (`int`, *required*, defaults to 8): + The parallel size in total. + model_parallel_size (`int`, *required*, defaults to 8): + The parallel size of model(tensor parallel). + master_ip (`str`, *required*): + The master IP, can usually be set to `"127.0.0.1"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + master_port (`str`, *required*): + The master port, can usually be set to `"29500"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + seed (`int`, *optional*, defaults to 42): + Random seed to control sampling. + """ def __init__(self, model_dir, rank, **kwargs): super().__init__(model_dir, **kwargs) @@ -29,7 +71,7 @@ class DistributedPlug(TorchModel): initialize_distributed(rank, mpu, kwargs['world_size'], kwargs['model_parallel_size'], kwargs['master_ip'], kwargs['master_port']) - seed = 0 if 'seed' not in kwargs else kwargs['seed'] + seed = 42 if 'seed' not in kwargs else kwargs['seed'] set_random_seed_mpu(seed) self.iteration = 0 self.dist_model = self.initialize_model(path_load_tag='model') diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py index 9d2bb14f..df00006b 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/modeling_plug.py @@ -152,15 +152,7 @@ class BertSelfOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module in [ - 'all', 'encoder', 'encoder_self', 'encoder_selfvo', - 'encoder_selfo' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -173,12 +165,8 @@ class BertSelfOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -210,20 +198,13 @@ class BertAttention(nn.Module): output_parallel=True, init_method=normal_init_method( mean=0.0, std=config.initializer_range), - separate=config.attn_separate, - pruning_method=config.pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - 
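Editorial note: per the `DistributedPlug` documentation above, each GPU rank constructs its own instance and loads its own `mp_rank_0*_model_states.pt` slice. The snippet below is only a hypothetical launcher sketch assembled from the documented parameters; the intended entry point remains the distributed pipeline mentioned in the docstrings, and the model path is a placeholder:

```python
# Hypothetical per-rank launcher; argument names and values mirror the
# DistributedPlug docstring (world_size=8, model_parallel_size=8,
# master_ip='127.0.0.1', master_port='29500', seed defaulting to 42).
import torch.multiprocessing as mp


def run(rank):
    from modelscope.models.nlp.plug.distributed_plug import DistributedPlug
    plug = DistributedPlug(
        '/path/to/nlp_plug_text-generation_27B',  # placeholder model_dir
        rank,
        world_size=8,
        model_parallel_size=8,
        master_ip='127.0.0.1',
        master_port='29500')
    # rank i now holds the slice stored in mp_rank_0{i}_model_states.pt


if __name__ == '__main__':
    mp.spawn(run, nprocs=8)
```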
pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + separate=config.attn_separate) self.output = BertSelfOutput(config) def forward( self, input_tensor, attention_mask, - pruning_threshold=None, ): if self.LayerNorm is not None: ln_input = input_tensor @@ -236,20 +217,16 @@ class BertAttention(nn.Module): self_output = self.self( ln_output, attention_mask, - pruning_threshold=pruning_threshold, ) else: self_output = self.self( input_tensor, attention_mask, - pruning_threshold=pruning_threshold, ) - output_pruning_threshold = pruning_threshold attention_output = self.output( self_output, input_tensor, - pruning_threshold=output_pruning_threshold, ) return attention_output @@ -265,25 +242,15 @@ class BertIntermediate(nn.Module): gather_output=False, stride=1, init_method=normal_init_method( - mean=0.0, std=config.initializer_range), - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + mean=0.0, std=config.initializer_range)) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act def forward( self, hidden_states, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states @@ -306,13 +273,7 @@ class BertOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -325,12 +286,8 @@ class BertOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -359,14 +316,8 @@ class BertLayer(nn.Module): else: self.LayerNorm = None - def forward( - self, - hidden_states, - attention_mask, - pruning_threshold=None, - ): - attention_output = self.attention( - hidden_states, attention_mask, pruning_threshold=pruning_threshold) + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) if self.LayerNorm is not None: ln_input = attention_output previous_type = attention_output.type() @@ -375,15 +326,10 @@ class BertLayer(nn.Module): ln_output = self.LayerNorm(ln_input) if self.fp32_layernorm: ln_output = ln_output.type(previous_type) - intermediate_output = self.intermediate( - ln_output, pruning_threshold=pruning_threshold) + intermediate_output = self.intermediate(ln_output) else: - intermediate_output = self.intermediate( - attention_output, pruning_threshold=pruning_threshold) - layer_output = self.output( - intermediate_output, - attention_output, - pruning_threshold=pruning_threshold) + intermediate_output = 
self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) return layer_output @@ -407,7 +353,6 @@ class BertEncoder(nn.Module): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): all_encoder_layers = [] @@ -417,8 +362,7 @@ class BertEncoder(nn.Module): layers = self.layer[start:end] x_ = inputs[0] for layer in layers: - x_ = layer( - x_, inputs[1], pruning_threshold=pruning_threshold) + x_ = layer(x_, inputs[1]) return x_ return custom_forward @@ -654,7 +598,6 @@ class BertModel(PreTrainedBertModel): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): if attention_mask is None: attention_mask = torch.ones_like(input_ids) @@ -683,8 +626,7 @@ class BertModel(PreTrainedBertModel): extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers, checkpoint_activations=checkpoint_activations, - detach_index=detach_index, - pruning_threshold=pruning_threshold) + detach_index=detach_index) sequence_output = encoded_layers[-1] for p in self.pooler.parameters(): if p is None: @@ -709,18 +651,6 @@ class DecodeLayer(nn.Module): std=config.initializer_range, num_layers=config.num_hidden_layers) - self_pruning_method = config.pruning_method - cross_pruning_method = config.pruning_method - ffn_pruning_method = config.pruning_method - - if config.ft_module is not None: - if 'decoder_self' in config.ft_module: - self_pruning_method = 'finetune' - if 'decoder_cross' in config.ft_module: - cross_pruning_method = 'finetune' - if 'decoder_ffn' in config.ft_module: - ffn_pruning_method = 'finetune' - self.attention = mpu.GPT2ParallelSelfAttention( hidden_size=config.hidden_size, num_attention_heads=config.num_attention_heads, @@ -728,13 +658,6 @@ class DecodeLayer(nn.Module): output_dropout_prob=config.hidden_dropout_prob, init_method=init_method, output_layer_init_method=output_layer_init_method, - pruning_method=self_pruning_method if config.pruning_module in [ - 'all', 'decoder', 'decoder_self', 'decoder_self+ffn' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.cross_attention = mpu.PalmParallelCrossAttention( @@ -745,12 +668,6 @@ class DecodeLayer(nn.Module): init_method=init_method, attn_separate=False, output_layer_init_method=output_layer_init_method, - pruning_method=cross_pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.input_layernorm = BertLayerNorm( @@ -765,12 +682,6 @@ class DecodeLayer(nn.Module): config.intermediate_size, gather_output=False, init_method=init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act @@ -779,12 +690,6 @@ class DecodeLayer(nn.Module): config.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 
'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) @@ -804,8 +709,7 @@ class DecodeLayer(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=False, - pruning_threshold=None): + is_infer=False): residual = hidden_states previous_type = hidden_states.type() hidden_states = self.input_layernorm( @@ -813,10 +717,7 @@ class DecodeLayer(nn.Module): if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) hidden_states = self.attention( - hidden_states, - dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + hidden_states, dec_attn_mask, is_infer=is_infer) hidden_states = residual + hidden_states @@ -825,23 +726,18 @@ class DecodeLayer(nn.Module): self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.cross_attention( - hidden_states, - enc_hidden_states, - enc_attn_mask, - pruning_threshold=pruning_threshold) + hidden_states = self.cross_attention(hidden_states, enc_hidden_states, + enc_attn_mask) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.post_cross_attention_layernorm( self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.intermediate( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) - hidden_states = self.output( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.output(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = residual + hidden_states @@ -866,8 +762,7 @@ class BertDecoder(nn.Module): dec_attn_mask, checkpoint_activations=False, output_all_encoded_layers=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): def custom(start, end): @@ -880,8 +775,7 @@ class BertDecoder(nn.Module): inputs[1], inputs[2], dec_attn_mask * 1, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return x_ return custom_forward @@ -904,8 +798,7 @@ class BertDecoder(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) previous_type = hidden_states.type() if self.fp32_layernorm: @@ -932,8 +825,7 @@ class DecodeModel(PreTrainedBertModel): enc_attn_mask=None, dec_attn_mask=None, checkpoint_activations=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to( dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility @@ -946,8 +838,7 @@ class DecodeModel(PreTrainedBertModel): extended_attention_mask, dec_attn_mask, checkpoint_activations=False, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return sequence_output[-1] @@ -972,16 +863,14 @@ class PalmForPreTraining(PreTrainedBertModel): checkpoint_activations=False, is_infer=False, sequence_output=None, - parallel_output=True, - pruning_threshold=None): + parallel_output=True): if sequence_output is None: sequence_output, pooled_output = self.bert( input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, - 
checkpoint_activations=checkpoint_activations, - pruning_threshold=pruning_threshold) + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) else: @@ -998,8 +887,7 @@ class PalmForPreTraining(PreTrainedBertModel): attention_mask, decode_attention_mask, checkpoint_activations=checkpoint_activations, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) transformer_output_parallel = mpu.copy_to_model_parallel_region( decode_output) @@ -1017,6 +905,29 @@ class PalmForPreTraining(PreTrainedBertModel): class PlugModel(torch.nn.Module): + """ + The bare Plug Model transformer outputting raw hidden-states without any specific head on top. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`PlugNLGConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~DistributedPlug.initialize_model`] method to load the model weights. + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + """ def __init__(self, config): super(PlugModel, self).__init__() @@ -1034,6 +945,58 @@ class PlugModel(torch.nn.Module): is_infer=False, sequence_output=None, parallel_output=True): + """ + Parameters: + input_tokens (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`): + `input_tokens_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + Indices can be obtained using transformers [`BertTokenizer`]. See + [`TextGenerationPreprocessor.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`, *optional*, defaults to + None): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + target_tokens (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Target token ids(labels) for language modeling. Note that the labels **are shifted** inside the model, + i.e. 
you can set `target_tokens = input_tokens` Indices are selected in + `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only + computed for labels in `[0, ..., config.vocab_size]` + + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.max_position_embeddings - 1]`. + + decode_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults + to None): + Mask to avoid performing attention on padding token indices of target tokens. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + checkpoint_activations (`boolean`, *optional*, defaults to `False`): + Whether gradient checkpointing is activated for this model or not. + is_infer (`boolean`, *optional*, defaults to `False`): + Whether or not to perform single inference. + sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, + defaults to None): + Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the + model. A single forward() call can produce one single token. To generate the current token, the + sequence_output generated by the `forward()` of the previous token is required. + parallel_output (`boolean`, *optional*, defaults to `True`): + To parallel return output, or gather it before return. + + + """ return self.model( input_tokens, token_type_ids, From 69da8f91ac5ca420408100c4ec5abd0c5987e65a Mon Sep 17 00:00:00 2001 From: "ashui.cbh" Date: Tue, 11 Oct 2022 20:49:13 +0800 Subject: [PATCH 018/129] [to #42322933]suport image inpainting Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10111615 --- .../image_inpainting/image_inpainting.png | 3 + .../image_inpainting_mask.png | 3 + modelscope/metainfo.py | 5 + modelscope/metrics/__init__.py | 2 + modelscope/metrics/builder.py | 2 + modelscope/metrics/image_inpainting_metric.py | 210 +++++++ modelscope/models/cv/__init__.py | 17 +- .../models/cv/crowd_counting/cc_model.py | 2 + .../cv/crowd_counting/hrnet_aspp_relu.py | 14 +- .../models/cv/image_inpainting/__init__.py | 22 + modelscope/models/cv/image_inpainting/base.py | 75 +++ .../models/cv/image_inpainting/default.py | 210 +++++++ .../models/cv/image_inpainting/model.py | 36 ++ .../cv/image_inpainting/modules/__init__.py | 0 .../modules/ade20k/__init__.py | 2 + .../image_inpainting/modules/ade20k/base.py | 380 +++++++++++ .../image_inpainting/modules/ade20k/resnet.py | 183 ++++++ .../image_inpainting/modules/adversarial.py | 167 +++++ .../modules/feature_matching.py | 45 ++ .../models/cv/image_inpainting/modules/ffc.py | 588 ++++++++++++++++++ .../cv/image_inpainting/modules/inception.py | 324 ++++++++++ .../cv/image_inpainting/modules/perceptual.py | 47 ++ .../cv/image_inpainting/modules/pix2pixhd.py | 75 +++ .../models/cv/image_inpainting/refinement.py | 393 ++++++++++++ .../msdatasets/task_datasets/__init__.py | 2 + .../image_inpainting/__init__.py | 2 + .../task_datasets/image_inpainting/aug.py | 100 +++ .../image_inpainting_dataset.py | 337 ++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 2 + .../pipelines/cv/image_inpainting_pipeline.py | 146 +++++ modelscope/trainers/__init__.py | 5 +- modelscope/trainers/cv/__init__.py | 4 +- 
.../trainers/cv/image_inpainting_trainer.py | 111 ++++ modelscope/utils/constant.py | 4 +- requirements/cv.txt | 2 + tests/pipelines/test_image_inpainting.py | 77 +++ tests/run_config.yaml | 1 + .../trainers/test_image_inpainting_trainer.py | 84 +++ 40 files changed, 3666 insertions(+), 19 deletions(-) create mode 100644 data/test/images/image_inpainting/image_inpainting.png create mode 100644 data/test/images/image_inpainting/image_inpainting_mask.png create mode 100644 modelscope/metrics/image_inpainting_metric.py create mode 100644 modelscope/models/cv/image_inpainting/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/base.py create mode 100644 modelscope/models/cv/image_inpainting/default.py create mode 100644 modelscope/models/cv/image_inpainting/model.py create mode 100644 modelscope/models/cv/image_inpainting/modules/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/base.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py create mode 100644 modelscope/models/cv/image_inpainting/modules/adversarial.py create mode 100644 modelscope/models/cv/image_inpainting/modules/feature_matching.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ffc.py create mode 100644 modelscope/models/cv/image_inpainting/modules/inception.py create mode 100644 modelscope/models/cv/image_inpainting/modules/perceptual.py create mode 100644 modelscope/models/cv/image_inpainting/modules/pix2pixhd.py create mode 100644 modelscope/models/cv/image_inpainting/refinement.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/aug.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py create mode 100644 modelscope/pipelines/cv/image_inpainting_pipeline.py create mode 100644 modelscope/trainers/cv/image_inpainting_trainer.py create mode 100644 tests/pipelines/test_image_inpainting.py create mode 100644 tests/trainers/test_image_inpainting_trainer.py diff --git a/data/test/images/image_inpainting/image_inpainting.png b/data/test/images/image_inpainting/image_inpainting.png new file mode 100644 index 00000000..e141012d --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5 +size 2747938 diff --git a/data/test/images/image_inpainting/image_inpainting_mask.png b/data/test/images/image_inpainting/image_inpainting_mask.png new file mode 100644 index 00000000..e30f67e7 --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting_mask.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15 +size 27616 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 77627abc..cae9d188 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -27,6 +27,7 @@ class Models(object): face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' image_reid_person = 'passvitb' + image_inpainting = 'FFTInpainting' video_summarization = 'pgl-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' @@ -179,6 +180,7 @@ class 
Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' + image_inpainting = 'fft-inpainting' text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' @@ -264,6 +266,7 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + image_inpainting = 'image-inpainting' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' @@ -363,6 +366,8 @@ class Metrics(object): video_summarization_metric = 'video-summarization-metric' # metric for movie-scene-segmentation task movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' + # metric for inpainting task + image_inpainting_metric = 'image-inpainting-metric' class Optimizers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index d3975a2c..e6a03a22 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: from .token_classification_metric import TokenClassificationMetric from .video_summarization_metric import VideoSummarizationMetric from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric + from .image_inpainting_metric import ImageInpaintingMetric else: _import_structure = { @@ -34,6 +35,7 @@ else: 'token_classification_metric': ['TokenClassificationMetric'], 'video_summarization_metric': ['VideoSummarizationMetric'], 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], + 'image_inpainting_metric': ['ImageInpaintingMetric'], } import sys diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 9e875cc4..ee4d2840 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -18,6 +18,7 @@ class MetricKeys(object): SSIM = 'ssim' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' + FID = 'fid' BLEU_1 = 'bleu-1' BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' @@ -39,6 +40,7 @@ task_default_metrics = { Tasks.image_captioning: [Metrics.text_gen_metric], Tasks.visual_question_answering: [Metrics.text_gen_metric], Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], + Tasks.image_inpainting: [Metrics.image_inpainting_metric], } diff --git a/modelscope/metrics/image_inpainting_metric.py b/modelscope/metrics/image_inpainting_metric.py new file mode 100644 index 00000000..954d4ca2 --- /dev/null +++ b/modelscope/metrics/image_inpainting_metric.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F +from scipy import linalg + +from modelscope.metainfo import Metrics +from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3 +from modelscope.utils.registry import default_group +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .base import Metric +from .builder import METRICS, MetricKeys + + +def fid_calculate_activation_statistics(act): + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6): + mu1, sigma1 = 
fid_calculate_activation_statistics(activations_pred) + mu2, sigma2 = fid_calculate_activation_statistics(activations_target) + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) + - 2 * tr_covmean) + + +class FIDScore(torch.nn.Module): + + def __init__(self, dims=2048, eps=1e-6): + super().__init__() + if getattr(FIDScore, '_MODEL', None) is None: + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + FIDScore._MODEL = InceptionV3([block_idx]).eval() + self.model = FIDScore._MODEL + self.eps = eps + self.reset() + + def forward(self, pred_batch, target_batch, mask=None): + activations_pred = self._get_activations(pred_batch) + activations_target = self._get_activations(target_batch) + + self.activations_pred.append(activations_pred.detach().cpu()) + self.activations_target.append(activations_target.detach().cpu()) + + def get_value(self): + activations_pred, activations_target = (self.activations_pred, + self.activations_target) + activations_pred = torch.cat(activations_pred).cpu().numpy() + activations_target = torch.cat(activations_target).cpu().numpy() + + total_distance = calculate_frechet_distance( + activations_pred, activations_target, eps=self.eps) + + self.reset() + return total_distance + + def reset(self): + self.activations_pred = [] + self.activations_target = [] + + def _get_activations(self, batch): + activations = self.model(batch)[0] + if activations.shape[2] != 1 or activations.shape[3] != 1: + assert False, \ + 'We should not have got here, because Inception always scales inputs to 299x299' + activations = activations.squeeze(-1).squeeze(-1) + return activations + + +class SSIM(torch.nn.Module): + """SSIM. 
Modified from: + https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py + """ + + def __init__(self, window_size=11, size_average=True): + super().__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.register_buffer('window', + self._create_window(window_size, self.channel)) + + def forward(self, img1, img2): + assert len(img1.shape) == 4 + + channel = img1.size()[1] + + if channel == self.channel and self.window.data.type( + ) == img1.data.type(): + window = self.window + else: + window = self._create_window(self.window_size, channel) + + window = window.type_as(img1) + + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) + + def _gaussian(self, window_size, sigma): + gauss = torch.Tensor([ + np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def _create_window(self, window_size, channel): + _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm( + _1D_window.t()).float().unsqueeze(0).unsqueeze(0) + return _2D_window.expand(channel, 1, window_size, + window_size).contiguous() + + def _ssim(self, + img1, + img2, + window, + window_size, + channel, + size_average=True): + mu1 = F.conv2d( + img1, window, padding=(window_size // 2), groups=channel) + mu2 = F.conv2d( + img2, window, padding=(window_size // 2), groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=(window_size // 2), + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=(window_size // 2), + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=(window_size // 2), + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \ + ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + + return ssim_map.mean(1).mean(1).mean(1) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + return + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.image_inpainting_metric) +class ImageInpaintingMetric(Metric): + """The metric computation class for image inpainting classes. + """ + + def __init__(self): + self.preds = [] + self.targets = [] + self.SSIM = SSIM(window_size=11, size_average=False).eval() + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.FID = FIDScore().to(device) + + def add(self, outputs: Dict, inputs: Dict): + pred = outputs['inpainted'] + target = inputs['image'] + self.preds.append(torch_nested_detach(pred)) + self.targets.append(torch_nested_detach(target)) + + def evaluate(self): + ssim_list = [] + for (pred, target) in zip(self.preds, self.targets): + ssim_list.append(self.SSIM(pred, target)) + self.FID(pred, target) + ssim_list = torch_nested_numpify(ssim_list) + fid = self.FID.get_value() + return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid} diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index f2798b59..ba7b03c5 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,13 +5,14 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_instance_segmentation, - image_panoptic_segmentation, image_portrait_enhancement, - image_reid_person, image_semantic_segmentation, - image_to_image_generation, image_to_image_translation, - movie_scene_segmentation, object_detection, - product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, - video_single_object_tracking, video_summarization, virual_tryon) + image_colorization, image_denoise, image_inpainting, + image_instance_segmentation, image_panoptic_segmentation, + image_portrait_enhancement, image_reid_person, + image_semantic_segmentation, image_to_image_generation, + image_to_image_translation, movie_scene_segmentation, + object_detection, product_retrieval_embedding, + realtime_object_detection, salient_detection, shop_segmentation, + super_resolution, video_single_object_tracking, + video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/crowd_counting/cc_model.py b/modelscope/models/cv/crowd_counting/cc_model.py index 582b26f4..16fbc261 100644 --- a/modelscope/models/cv/crowd_counting/cc_model.py +++ b/modelscope/models/cv/crowd_counting/cc_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py index 982ba939..0d1bd3ca 100644 --- a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py +++ b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py @@ -1,10 +1,10 @@ -# ------------------------------------------------------------------------------ -# Copyright (c) Microsoft -# Licensed under the MIT License. -# Written by Bin Xiao (Bin.Xiao@microsoft.com) -# Modified by Ke Sun (sunk@mail.ustc.edu.cn) -# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py -# ------------------------------------------------------------------------------ +""" +Copyright (c) Microsoft +Licensed under the MIT License. +Written by Bin Xiao (Bin.Xiao@microsoft.com) +Modified by Ke Sun (sunk@mail.ustc.edu.cn) +https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py +""" import functools import logging diff --git a/modelscope/models/cv/image_inpainting/__init__.py b/modelscope/models/cv/image_inpainting/__init__.py new file mode 100644 index 00000000..e7c63cd4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
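Editorial note: for reference, the `calculate_frechet_distance` helper added in `image_inpainting_metric.py` above computes the standard Fréchet (FID) distance between Gaussians fitted to the two sets of InceptionV3 activations; in closed form it is exactly the returned expression `diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean`:

```latex
\[
  \mathrm{FID}
  = \lVert \mu_1 - \mu_2 \rVert_2^2
  + \operatorname{Tr}\!\bigl(\Sigma_1 + \Sigma_2 - 2\,(\Sigma_1 \Sigma_2)^{1/2}\bigr)
\]
```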
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import FFTInpainting + +else: + _import_structure = { + 'model': ['FFTInpainting'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_inpainting/base.py b/modelscope/models/cv/image_inpainting/base.py new file mode 100644 index 00000000..04e73630 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/base.py @@ -0,0 +1,75 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .modules.adversarial import NonSaturatingWithR1 +from .modules.ffc import FFCResNetGenerator +from .modules.perceptual import ResNetPL +from .modules.pix2pixhd import NLayerDiscriminator + +LOGGER = get_logger() + + +class BaseInpaintingTrainingModule(nn.Module): + + def __init__(self, + model_dir='', + use_ddp=True, + predict_only=False, + visualize_each_iters=100, + average_generator=False, + generator_avg_beta=0.999, + average_generator_start_step=30000, + average_generator_period=10, + store_discr_outputs_for_vis=False, + **kwargs): + super().__init__() + LOGGER.info( + f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}' + ) + + self.generator = FFCResNetGenerator() + self.use_ddp = use_ddp + + if not predict_only: + self.discriminator = NLayerDiscriminator() + self.adversarial_loss = NonSaturatingWithR1( + weight=10, + gp_coef=0.001, + mask_as_fake_target=True, + allow_scale_mask=True) + + self.average_generator = average_generator + self.generator_avg_beta = generator_avg_beta + self.average_generator_start_step = average_generator_start_step + self.average_generator_period = average_generator_period + self.generator_average = None + self.last_generator_averaging_step = -1 + self.store_discr_outputs_for_vis = store_discr_outputs_for_vis + + self.loss_l1 = nn.L1Loss(reduction='none') + + self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir) + + self.visualize_each_iters = visualize_each_iters + LOGGER.info('BaseInpaintingTrainingModule init done') + + def forward(self, batch: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" + raise NotImplementedError() + + def generator_loss(self, + batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() + + def discriminator_loss( + self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() diff --git a/modelscope/models/cv/image_inpainting/default.py b/modelscope/models/cv/image_inpainting/default.py new file mode 100644 index 00000000..5f57d63f --- /dev/null +++ b/modelscope/models/cv/image_inpainting/default.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import bisect + +import torch +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .base import BaseInpaintingTrainingModule +from .modules.feature_matching import feature_matching_loss, masked_l1_loss + +LOGGER = get_logger() + + +def set_requires_grad(module, 
value): + for param in module.parameters(): + param.requires_grad = value + + +def add_prefix_to_keys(dct, prefix): + return {prefix + k: v for k, v in dct.items()} + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class LadderRamp: + + def __init__(self, start_iters, values): + self.start_iters = start_iters + self.values = values + assert len(values) == len(start_iters) + 1, (len(values), + len(start_iters)) + + def __call__(self, i): + segment_i = bisect.bisect_right(self.start_iters, i) + return self.values[segment_i] + + +def get_ramp(kind='ladder', **kwargs): + if kind == 'linear': + return LinearRamp(**kwargs) + if kind == 'ladder': + return LadderRamp(**kwargs) + raise ValueError(f'Unexpected ramp kind: {kind}') + + +class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): + + def __init__(self, + model_dir='', + predict_only=False, + concat_mask=True, + rescale_scheduler_kwargs=None, + image_to_discriminator='predicted_image', + add_noise_kwargs=None, + noise_fill_hole=False, + const_area_crop_kwargs=None, + distance_weighter_kwargs=None, + distance_weighted_mask_for_discr=False, + fake_fakes_proba=0, + fake_fakes_generator_kwargs=None, + **kwargs): + super().__init__(model_dir=model_dir, predict_only=predict_only) + self.concat_mask = concat_mask + self.rescale_size_getter = get_ramp( + **rescale_scheduler_kwargs + ) if rescale_scheduler_kwargs is not None else None + self.image_to_discriminator = image_to_discriminator + self.add_noise_kwargs = add_noise_kwargs + self.noise_fill_hole = noise_fill_hole + self.const_area_crop_kwargs = const_area_crop_kwargs + self.refine_mask_for_losses = None + self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr + + self.feature_matching_weight = 100 + self.losses_l1_weight_known = 10 + self.losses_l1_weight_missing = 0 + self.fake_fakes_proba = fake_fakes_proba + + def forward(self, batch): + img = batch['image'] + mask = batch['mask'] + + masked_img = img * (1 - mask) + + if self.concat_mask: + masked_img = torch.cat([masked_img, mask], dim=1) + + batch['predicted_image'] = self.generator(masked_img) + batch['inpainted'] = mask * batch['predicted_image'] + ( + 1 - mask) * batch['image'] + + batch['mask_for_losses'] = mask + + return batch + + def generator_loss(self, batch): + img = batch['image'] + predicted_img = batch[self.image_to_discriminator] + original_mask = batch['mask'] + supervised_mask = batch['mask_for_losses'] + + # L1 + l1_value = masked_l1_loss(predicted_img, img, supervised_mask, + self.losses_l1_weight_known, + self.losses_l1_weight_missing) + + total_loss = l1_value + metrics = dict(gen_l1=l1_value) + + # discriminator + # adversarial_loss calls backward by itself + mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask + self.adversarial_loss.pre_generator_step( + real_batch=img, + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator(img) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_gen_loss, 
adv_metrics = self.adversarial_loss.generator_loss( + real_batch=img, + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=mask_for_discr) + total_loss = total_loss + adv_gen_loss + metrics['gen_adv'] = adv_gen_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + # feature matching + if self.feature_matching_weight > 0: + need_mask_in_fm = False + mask_for_fm = supervised_mask if need_mask_in_fm else None + fm_value = feature_matching_loss( + discr_fake_features, discr_real_features, + mask=mask_for_fm) * self.feature_matching_weight + total_loss = total_loss + fm_value + metrics['gen_fm'] = fm_value + + if self.loss_resnet_pl is not None: + resnet_pl_value = self.loss_resnet_pl(predicted_img, img) + total_loss = total_loss + resnet_pl_value + metrics['gen_resnet_pl'] = resnet_pl_value + + return total_loss, metrics + + def discriminator_loss(self, batch): + total_loss = 0 + metrics = {} + + predicted_img = batch[self.image_to_discriminator].detach() + self.adversarial_loss.pre_discriminator_step( + real_batch=batch['image'], + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator( + batch['image']) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss( + real_batch=batch['image'], + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=batch['mask']) + + total_loss = (total_loss + adv_discr_loss) * 0.1 + metrics['discr_adv'] = adv_discr_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + return total_loss, metrics + + def _do_step(self, batch, optimizer_idx=None): + if optimizer_idx == 0: # step for generator + set_requires_grad(self.generator, True) + set_requires_grad(self.discriminator, False) + elif optimizer_idx == 1: # step for discriminator + set_requires_grad(self.generator, False) + set_requires_grad(self.discriminator, True) + + batch = self(batch) + total_loss = 0 + if optimizer_idx is None or optimizer_idx == 0: # step for generator + total_loss, metrics = self.generator_loss(batch) + + elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator + total_loss, metrics = self.discriminator_loss(batch) + + result = dict(loss=total_loss) + return result diff --git a/modelscope/models/cv/image_inpainting/model.py b/modelscope/models/cv/image_inpainting/model.py new file mode 100644 index 00000000..b12f6edd --- /dev/null +++ b/modelscope/models/cv/image_inpainting/model.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
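The new model.py below wraps DefaultInpaintingTrainingModule in a registered FFTInpainting model; predict_only=True skips building the discriminator and adversarial loss, and pretrained=True loads the torch checkpoint from model_dir. A minimal smoke-test sketch of the input/output contract, assuming model_dir points at a local copy of the released weight files and that images and masks are float tensors in [0, 1] with mask value 1 marking the pixels to fill (paths, shapes and value conventions here are assumptions, not stated in this patch):

```python
import torch

from modelscope.models.cv.image_inpainting import FFTInpainting

# model_dir is assumed to contain the released weight files (the torch checkpoint
# plus the ade20k weights loaded by the ResNetPL perceptual loss).
model = FFTInpainting('/path/to/model_dir', predict_only=True)
model.eval()

batch = {
    'image': torch.rand(1, 3, 512, 512),                 # RGB image, float in [0, 1]
    'mask': (torch.rand(1, 1, 512, 512) > 0.9).float(),  # 1 marks pixels to fill
}
with torch.no_grad():
    out = model(batch)

# forward() returns the input batch enriched with 'predicted_image' and 'inpainted'.
print(out['inpainted'].shape)  # torch.Size([1, 3, 512, 512])
```

During training, the _do_step method shown above alternates generator and discriminator updates by toggling requires_grad on the two sub-networks, so only the loss selected by optimizer_idx contributes gradients.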
+import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +LOGGER = get_logger() + + +@MODELS.register_module( + Tasks.image_inpainting, module_name=Models.image_inpainting) +class FFTInpainting(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + super().__init__(model_dir, **kwargs) + + from .default import DefaultInpaintingTrainingModule + pretrained = kwargs.get('pretrained', True) + predict_only = kwargs.get('predict_only', False) + net = DefaultInpaintingTrainingModule( + model_dir=model_dir, predict_only=predict_only) + if pretrained: + path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + LOGGER.info(f'loading pretrained model from {path}') + state = torch.load(path, map_location='cpu') + net.load_state_dict(state, strict=False) + self.model = net + + def forward(self, inputs): + return self.model(inputs) diff --git a/modelscope/models/cv/image_inpainting/modules/__init__.py b/modelscope/models/cv/image_inpainting/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py new file mode 100644 index 00000000..89c3e293 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .base import ModelBuilder diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/base.py b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py new file mode 100644 index 00000000..02bd3cc4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py @@ -0,0 +1,380 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules import BatchNorm2d + +from . import resnet + +NUM_CLASS = 150 + + +# Model Builder +class ModelBuilder: + # custom weights initialization + @staticmethod + def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight.data) + elif classname.find('BatchNorm') != -1: + m.weight.data.fill_(1.) 
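+            # the bias below is filled with 1e-4 rather than 0, kept as in the upstream code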
+ m.bias.data.fill_(1e-4) + + @staticmethod + def build_encoder(arch='resnet50dilated', + fc_dim=512, + weights='', + model_dir=''): + pretrained = True if len(weights) == 0 else False + arch = arch.lower() + if arch == 'resnet50dilated': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = ResnetDilated(orig_resnet, dilate_scale=8) + elif arch == 'resnet50': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = Resnet(orig_resnet) + else: + raise Exception('Architecture undefined!') + + # encoders are usually pretrained + # net_encoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_encoder') + net_encoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_encoder + + @staticmethod + def build_decoder(arch='ppm_deepsup', + fc_dim=512, + num_class=NUM_CLASS, + weights='', + use_softmax=False, + drop_last_conv=False): + arch = arch.lower() + if arch == 'ppm_deepsup': + net_decoder = PPMDeepsup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + elif arch == 'c1_deepsup': + net_decoder = C1DeepSup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + else: + raise Exception('Architecture undefined!') + + net_decoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_decoder') + net_decoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_decoder + + @staticmethod + def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim, + drop_last_conv, *arts, **kwargs): + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth') + return ModelBuilder.build_decoder( + arch=arch_decoder, + fc_dim=fc_dim, + weights=path, + use_softmax=True, + drop_last_conv=drop_last_conv) + + @staticmethod + def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim, + segmentation, *arts, **kwargs): + if segmentation: + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth') + else: + path = '' + return ModelBuilder.build_encoder( + arch=arch_encoder, + fc_dim=fc_dim, + weights=path, + model_dir=weights_path) + + +def conv3x3_bn_relu(in_planes, out_planes, stride=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + BatchNorm2d(out_planes), + nn.ReLU(inplace=True), + ) + + +# pyramid pooling, deep supervision +class PPMDeepsup(nn.Module): + + def __init__(self, + num_class=NUM_CLASS, + fc_dim=4096, + use_softmax=False, + pool_scales=(1, 2, 3, 6), + drop_last_conv=False): + super().__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.ppm = [] + for scale in pool_scales: + self.ppm.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(scale), + nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), + BatchNorm2d(512), nn.ReLU(inplace=True))) + self.ppm = nn.ModuleList(self.ppm) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + self.conv_last = nn.Sequential( + nn.Conv2d( + fc_dim + len(pool_scales) * 512, + 512, + kernel_size=3, + padding=1, + bias=False), BatchNorm2d(512), nn.ReLU(inplace=True), + nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1)) + 
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.dropout_deepsup = nn.Dropout2d(0.1) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + input_size = conv5.size() + ppm_out = [conv5] + for pool_scale in self.ppm: + ppm_out.append( + nn.functional.interpolate( + pool_scale(conv5), (input_size[2], input_size[3]), + mode='bilinear', + align_corners=False)) + ppm_out = torch.cat(ppm_out, 1) + + if self.drop_last_conv: + return ppm_out + else: + x = self.conv_last(ppm_out) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.dropout_deepsup(_) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +class Resnet(nn.Module): + + def __init__(self, orig_resnet): + super(Resnet, self).__init__() + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# Resnet Dilated +class ResnetDilated(nn.Module): + + def __init__(self, orig_resnet, dilate_scale=8): + super().__init__() + from functools import partial + + if dilate_scale == 8: + orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2)) + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4)) + elif dilate_scale == 16: + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2)) + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def _nostride_dilate(self, m, dilate): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + # the convolution with stride + if m.stride == (2, 2): + m.stride = (1, 1) + if m.kernel_size == (3, 3): + m.dilation = (dilate // 2, dilate // 2) + m.padding = (dilate // 2, dilate // 2) + # other convoluions + else: + if m.kernel_size == (3, 3): + m.dilation = (dilate, dilate) + m.padding = (dilate, dilate) + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = 
self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# last conv, deep supervision +class C1DeepSup(nn.Module): + + def __init__(self, + num_class=150, + fc_dim=2048, + use_softmax=False, + drop_last_conv=False): + super(C1DeepSup, self).__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + x = self.cbr(conv5) + + if self.drop_last_conv: + return x + else: + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +# last conv +class C1(nn.Module): + + def __init__(self, num_class=150, fc_dim=2048, use_softmax=False): + super(C1, self).__init__() + self.use_softmax = use_softmax + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + x = self.cbr(conv5) + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + else: + x = nn.functional.log_softmax(x, dim=1) + + return x diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py new file mode 100644 index 00000000..7da9ff07 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py @@ -0,0 +1,183 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import math +import os + +import torch +import torch.nn as nn +from torch.nn import BatchNorm2d + +__all__ = ['ResNet', 'resnet50'] + + +def conv3x3(in_planes, out_planes, stride=1): + '3x3 convolution with padding' + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, 
inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 128 + super(ResNet, self).__init__() + self.conv1 = conv3x3(3, 64, stride=2) + self.bn1 = BatchNorm2d(64) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = conv3x3(64, 64) + self.bn2 = BatchNorm2d(64) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = conv3x3(64, 128) + self.bn3 = BatchNorm2d(128) + self.relu3 = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def resnet50(pretrained=False, model_dir='', **kwargs): + """Constructs a ResNet-50 model. 
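+    When pretrained=True, weights are read from 'resnet50-imagenet.pth' under
+    model_dir instead of being downloaded.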
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth') + model.load_state_dict( + torch.load(cached_file, map_location='cpu'), strict=False) + return model diff --git a/modelscope/models/cv/image_inpainting/modules/adversarial.py b/modelscope/models/cv/image_inpainting/modules/adversarial.py new file mode 100644 index 00000000..b183876b --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/adversarial.py @@ -0,0 +1,167 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BaseAdversarialLoss: + + def pre_generator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for generator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for discriminator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate generator loss + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total generator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate discriminator loss and call .backward() on it + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total discriminator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def interpolate_mask(self, mask, shape): + assert mask is not None + assert self.allow_scale_mask or shape == mask.shape[-2:] + if shape != mask.shape[-2:] and self.allow_scale_mask: + if self.mask_scale_mode == 'maxpool': + mask = F.adaptive_max_pool2d(mask, shape) + else: + mask = F.interpolate( + mask, size=shape, mode=self.mask_scale_mode) + return mask + + +def make_r1_gp(discr_real_pred, 
real_batch): + if torch.is_grad_enabled(): + grad_real = torch.autograd.grad( + outputs=discr_real_pred.sum(), + inputs=real_batch, + create_graph=True)[0] + grad_penalty = (grad_real.view(grad_real.shape[0], + -1).norm(2, dim=1)**2).mean() + else: + grad_penalty = 0 + real_batch.requires_grad = False + + return grad_penalty + + +class NonSaturatingWithR1(BaseAdversarialLoss): + + def __init__(self, + gp_coef=5, + weight=1, + mask_as_fake_target=False, + allow_scale_mask=False, + mask_scale_mode='nearest', + extra_mask_weight_for_gen=0, + use_unmasked_for_gen=True, + use_unmasked_for_discr=True): + self.gp_coef = gp_coef + self.weight = weight + # use for discr => use for gen; + # otherwise we teach only the discr to pay attention to very small difference + assert use_unmasked_for_gen or (not use_unmasked_for_discr) + # mask as target => use unmasked for discr: + # if we don't care about unmasked regions at all + # then it doesn't matter if the value of mask_as_fake_target is true or false + assert use_unmasked_for_discr or (not mask_as_fake_target) + self.use_unmasked_for_gen = use_unmasked_for_gen + self.use_unmasked_for_discr = use_unmasked_for_discr + self.mask_as_fake_target = mask_as_fake_target + self.allow_scale_mask = allow_scale_mask + self.mask_scale_mode = mask_scale_mode + self.extra_mask_weight_for_gen = extra_mask_weight_for_gen + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + fake_loss = F.softplus(-discr_fake_pred) + if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ + not self.use_unmasked_for_gen: # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + if not self.use_unmasked_for_gen: + fake_loss = fake_loss * mask + else: + pixel_weights = 1 + mask * self.extra_mask_weight_for_gen + fake_loss = fake_loss * pixel_weights + + return fake_loss.mean() * self.weight, dict() + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + real_batch.requires_grad = True + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + + real_loss = F.softplus(-discr_real_pred) + grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef + fake_loss = F.softplus(discr_fake_pred) + + if not self.use_unmasked_for_discr or self.mask_as_fake_target: + # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + # use_unmasked_for_discr=False only makes sense for fakes; + # for reals there is no difference beetween two regions + fake_loss = fake_loss * mask + if self.mask_as_fake_target: + fake_loss = fake_loss + (1 + - mask) * F.softplus(-discr_fake_pred) + + sum_discr_loss = real_loss + grad_penalty + fake_loss + metrics = dict( + discr_real_out=discr_real_pred.mean(), + discr_fake_out=discr_fake_pred.mean(), + discr_real_gp=grad_penalty) + return sum_discr_loss.mean(), metrics diff --git a/modelscope/models/cv/image_inpainting/modules/feature_matching.py b/modelscope/models/cv/image_inpainting/modules/feature_matching.py new file mode 100644 index 00000000..c2effb20 --- /dev/null +++ 
b/modelscope/models/cv/image_inpainting/modules/feature_matching.py @@ -0,0 +1,45 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import List + +import torch +import torch.nn.functional as F + + +def masked_l2_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l2 = F.mse_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l2).mean() + + +def masked_l1_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l1 = F.l1_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l1).mean() + + +def feature_matching_loss(fake_features: List[torch.Tensor], + target_features: List[torch.Tensor], + mask=None): + if mask is None: + res = torch.stack([ + F.mse_loss(fake_feat, target_feat) + for fake_feat, target_feat in zip(fake_features, target_features) + ]).mean() + else: + res = 0 + norm = 0 + for fake_feat, target_feat in zip(fake_features, target_features): + cur_mask = F.interpolate( + mask, + size=fake_feat.shape[-2:], + mode='bilinear', + align_corners=False) + error_weights = 1 - cur_mask + cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() + res = res + cur_val + norm += 1 + res = res / norm + return res diff --git a/modelscope/models/cv/image_inpainting/modules/ffc.py b/modelscope/models/cv/image_inpainting/modules/ffc.py new file mode 100644 index 00000000..c74425e3 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ffc.py @@ -0,0 +1,588 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from kornia.geometry.transform import rotate + + +def get_activation(kind='tanh'): + if kind == 'tanh': + return nn.Tanh() + if kind == 'sigmoid': + return nn.Sigmoid() + if kind is False: + return nn.Identity() + raise ValueError(f'Unknown activation kind {kind}') + + +class SELayer(nn.Module): + + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + res = x * y.expand_as(x) + return res + + +class FourierUnit(nn.Module): + + def __init__(self, + in_channels, + out_channels, + groups=1, + spatial_scale_factor=None, + spatial_scale_mode='bilinear', + spectral_pos_encoding=False, + use_se=False, + se_kwargs=None, + ffc3d=False, + fft_norm='ortho'): + # bn_layer not used + super(FourierUnit, self).__init__() + self.groups = groups + + self.conv_layer = torch.nn.Conv2d( + in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), + out_channels=out_channels * 2, + kernel_size=1, + stride=1, + padding=0, + groups=self.groups, + bias=False) + self.bn = torch.nn.BatchNorm2d(out_channels * 2) + self.relu = torch.nn.ReLU(inplace=True) + + # squeeze and excitation block + self.use_se = use_se + if use_se: + if se_kwargs is None: + se_kwargs = {} + self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) + + self.spatial_scale_factor = 
spatial_scale_factor + self.spatial_scale_mode = spatial_scale_mode + self.spectral_pos_encoding = spectral_pos_encoding + self.ffc3d = ffc3d + self.fft_norm = fft_norm + + def forward(self, x): + batch = x.shape[0] + + if self.spatial_scale_factor is not None: + orig_size = x.shape[-2:] + x = F.interpolate( + x, + scale_factor=self.spatial_scale_factor, + mode=self.spatial_scale_mode, + align_corners=False) + + # (batch, c, h, w/2+1, 2) + fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) + ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) + ffted = torch.stack((ffted.real, ffted.imag), dim=-1) + ffted = ffted.permute(0, 1, 4, 2, + 3).contiguous() # (batch, c, 2, h, w/2+1) + ffted = ffted.view(( + batch, + -1, + ) + ffted.size()[3:]) + + if self.spectral_pos_encoding: + height, width = ffted.shape[-2:] + coords_vert = torch.linspace(0, 1, + height)[None, None, :, None].expand( + batch, 1, height, width).to(ffted) + coords_hor = torch.linspace(0, 1, + width)[None, None, None, :].expand( + batch, 1, height, width).to(ffted) + ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) + + if self.use_se: + ffted = self.se(ffted) + + ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) + ffted = self.relu(self.bn(ffted)) + + ffted = ffted.view(( + batch, + -1, + 2, + ) + ffted.size()[2:]).permute( + 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) + ffted = torch.complex(ffted[..., 0], ffted[..., 1]) + + ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] + output = torch.fft.irfftn( + ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) + + if self.spatial_scale_factor is not None: + output = F.interpolate( + output, + size=orig_size, + mode=self.spatial_scale_mode, + align_corners=False) + + return output + + +class SpectralTransform(nn.Module): + + def __init__(self, + in_channels, + out_channels, + stride=1, + groups=1, + enable_lfu=True, + **fu_kwargs): + # bn_layer not used + super(SpectralTransform, self).__init__() + self.enable_lfu = enable_lfu + if stride == 2: + self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) + else: + self.downsample = nn.Identity() + + self.stride = stride + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels // 2, + kernel_size=1, + groups=groups, + bias=False), nn.BatchNorm2d(out_channels // 2), + nn.ReLU(inplace=True)) + self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, + **fu_kwargs) + if self.enable_lfu: + self.lfu = FourierUnit(out_channels // 2, out_channels // 2, + groups) + self.conv2 = torch.nn.Conv2d( + out_channels // 2, + out_channels, + kernel_size=1, + groups=groups, + bias=False) + + def forward(self, x): + + x = self.downsample(x) + x = self.conv1(x) + output = self.fu(x) + + if self.enable_lfu: + n, c, h, w = x.shape + split_no = 2 + split_s = h // split_no + xs = torch.cat( + torch.split(x[:, :c // 4], split_s, dim=-2), + dim=1).contiguous() + xs = torch.cat( + torch.split(xs, split_s, dim=-1), dim=1).contiguous() + xs = self.lfu(xs) + xs = xs.repeat(1, 1, split_no, split_no).contiguous() + else: + xs = 0 + + output = self.conv2(x + output + xs) + + return output + + +class LearnableSpatialTransformWrapper(nn.Module): + + def __init__(self, + impl, + pad_coef=0.5, + angle_init_range=80, + train_angle=True): + super().__init__() + self.impl = impl + self.angle = torch.rand(1) * angle_init_range + if train_angle: + self.angle = nn.Parameter(self.angle, requires_grad=True) + self.pad_coef = pad_coef + + def forward(self, x): + if torch.is_tensor(x): + return 
self.inverse_transform(self.impl(self.transform(x)), x) + elif isinstance(x, tuple): + x_trans = tuple(self.transform(elem) for elem in x) + y_trans = self.impl(x_trans) + return tuple( + self.inverse_transform(elem, orig_x) + for elem, orig_x in zip(y_trans, x)) + else: + raise ValueError(f'Unexpected input type {type(x)}') + + def transform(self, x): + height, width = x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') + x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) + return x_padded_rotated + + def inverse_transform(self, y_padded_rotated, orig_x): + height, width = orig_x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + + y_padded = rotate( + y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) + y_height, y_width = y_padded.shape[2:] + y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w] + return y + + +class FFC(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + enable_lfu=True, + padding_type='reflect', + gated=False, + **spectral_kwargs): + super(FFC, self).__init__() + + assert stride == 1 or stride == 2, 'Stride should be 1 or 2.' + self.stride = stride + + in_cg = int(in_channels * ratio_gin) + in_cl = in_channels - in_cg + out_cg = int(out_channels * ratio_gout) + out_cl = out_channels - out_cg + + self.ratio_gin = ratio_gin + self.ratio_gout = ratio_gout + self.global_in_num = in_cg + + module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d + self.convl2l = module( + in_cl, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d + self.convl2g = module( + in_cl, + out_cg, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d + self.convg2l = module( + in_cg, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform + self.convg2g = module(in_cg, out_cg, stride, + 1 if groups == 1 else groups // 2, enable_lfu, + **spectral_kwargs) + + self.gated = gated + module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d + self.gate = module(in_channels, 2, 1) + + def forward(self, x): + x_l, x_g = x if type(x) is tuple else (x, 0) + out_xl, out_xg = 0, 0 + + if self.gated: + total_input_parts = [x_l] + if torch.is_tensor(x_g): + total_input_parts.append(x_g) + total_input = torch.cat(total_input_parts, dim=1) + + gates = torch.sigmoid(self.gate(total_input)) + g2l_gate, l2g_gate = gates.chunk(2, dim=1) + else: + g2l_gate, l2g_gate = 1, 1 + + if self.ratio_gout != 1: + out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate + if self.ratio_gout != 0: + out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) + + return out_xl, out_xg + + +class FFC_BN_ACT(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + norm_layer=nn.BatchNorm2d, + activation_layer=nn.Identity, + padding_type='reflect', + enable_lfu=True, + **kwargs): + super(FFC_BN_ACT, self).__init__() + self.ffc = FFC( + in_channels, + 
out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride, + padding, + dilation, + groups, + bias, + enable_lfu, + padding_type=padding_type, + **kwargs) + lnorm = nn.Identity if ratio_gout == 1 else norm_layer + gnorm = nn.Identity if ratio_gout == 0 else norm_layer + global_channels = int(out_channels * ratio_gout) + self.bn_l = lnorm(out_channels - global_channels) + self.bn_g = gnorm(global_channels) + + lact = nn.Identity if ratio_gout == 1 else activation_layer + gact = nn.Identity if ratio_gout == 0 else activation_layer + self.act_l = lact(inplace=True) + self.act_g = gact(inplace=True) + + def forward(self, x): + x_l, x_g = self.ffc(x) + x_l = self.act_l(self.bn_l(x_l)) + x_g = self.act_g(self.bn_g(x_g)) + return x_l, x_g + + +class FFCResnetBlock(nn.Module): + + def __init__(self, + dim, + padding_type, + norm_layer, + activation_layer=nn.ReLU, + dilation=1, + spatial_transform_kwargs=None, + inline=False, + **conv_kwargs): + super().__init__() + self.conv1 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + self.conv2 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + if spatial_transform_kwargs is not None: + self.conv1 = LearnableSpatialTransformWrapper( + self.conv1, **spatial_transform_kwargs) + self.conv2 = LearnableSpatialTransformWrapper( + self.conv2, **spatial_transform_kwargs) + self.inline = inline + + def forward(self, x): + if self.inline: + x_l, x_g = x[:, :-self.conv1.ffc. + global_in_num], x[:, -self.conv1.ffc.global_in_num:] + else: + x_l, x_g = x if type(x) is tuple else (x, 0) + + id_l, id_g = x_l, x_g + + x_l, x_g = self.conv1((x_l, x_g)) + x_l, x_g = self.conv2((x_l, x_g)) + + x_l, x_g = id_l + x_l, id_g + x_g + out = x_l, x_g + if self.inline: + out = torch.cat(out, dim=1) + return out + + +class ConcatTupleLayer(nn.Module): + + def forward(self, x): + assert isinstance(x, tuple) + x_l, x_g = x + assert torch.is_tensor(x_l) or torch.is_tensor(x_g) + if not torch.is_tensor(x_g): + return x_l + return torch.cat(x, dim=1) + + +class FFCResNetGenerator(nn.Module): + + def __init__(self, + input_nc=4, + output_nc=3, + ngf=64, + n_downsampling=3, + n_blocks=18, + norm_layer=nn.BatchNorm2d, + padding_type='reflect', + activation_layer=nn.ReLU, + up_norm_layer=nn.BatchNorm2d, + up_activation=nn.ReLU(True), + init_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + downsample_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + resnet_conv_kwargs={ + 'ratio_gin': 0.75, + 'ratio_gout': 0.75, + 'enable_lfu': False + }, + spatial_transform_layers=None, + spatial_transform_kwargs={}, + add_out_act='sigmoid', + max_features=1024, + out_ffc=False, + out_ffc_kwargs={}): + assert (n_blocks >= 0) + super().__init__() + + model = [ + nn.ReflectionPad2d(3), + FFC_BN_ACT( + input_nc, + ngf, + kernel_size=7, + padding=0, + norm_layer=norm_layer, + activation_layer=activation_layer, + **init_conv_kwargs) + ] + + # downsample + for i in range(n_downsampling): + mult = 2**i + if i == n_downsampling - 1: + cur_conv_kwargs = dict(downsample_conv_kwargs) + cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get( + 'ratio_gin', 0) + else: + cur_conv_kwargs = downsample_conv_kwargs + model += [ + FFC_BN_ACT( + min(max_features, ngf * mult), + 
min(max_features, ngf * mult * 2), + kernel_size=3, + stride=2, + padding=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + **cur_conv_kwargs) + ] + + mult = 2**n_downsampling + feats_num_bottleneck = min(max_features, ngf * mult) + + # resnet blocks + for i in range(n_blocks): + cur_resblock = FFCResnetBlock( + feats_num_bottleneck, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + **resnet_conv_kwargs) + if spatial_transform_layers is not None and i in spatial_transform_layers: + cur_resblock = LearnableSpatialTransformWrapper( + cur_resblock, **spatial_transform_kwargs) + model += [cur_resblock] + + model += [ConcatTupleLayer()] + + # upsample + for i in range(n_downsampling): + mult = 2**(n_downsampling - i) + model += [ + nn.ConvTranspose2d( + min(max_features, ngf * mult), + min(max_features, int(ngf * mult / 2)), + kernel_size=3, + stride=2, + padding=1, + output_padding=1), + up_norm_layer(min(max_features, int(ngf * mult / 2))), + up_activation + ] + + if out_ffc: + model += [ + FFCResnetBlock( + ngf, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + inline=True, + **out_ffc_kwargs) + ] + + model += [ + nn.ReflectionPad2d(3), + nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0) + ] + if add_out_act: + model.append( + get_activation('tanh' if add_out_act is True else add_out_act)) + self.model = nn.Sequential(*model) + + def forward(self, input): + return self.model(input) diff --git a/modelscope/models/cv/image_inpainting/modules/inception.py b/modelscope/models/cv/image_inpainting/modules/inception.py new file mode 100644 index 00000000..5070533d --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/inception.py @@ -0,0 +1,324 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import models + +from modelscope.utils.logger import get_logger + +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + +# Inception weights ported to Pytorch from +# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz +FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ + 'fid_weights/pt_inception-2015-12-05-6726825d.pth' + +LOGGER = get_logger() + + +class InceptionV3(nn.Module): + """Pretrained InceptionV3 network returning feature maps""" + + # Index of default block of inception to return, + # corresponds to output of final average pooling + DEFAULT_BLOCK_INDEX = 3 + + # Maps feature dimensionality to their output blocks indices + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=[DEFAULT_BLOCK_INDEX], + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True): + """Build pretrained InceptionV3 + + Parameters + ---------- + output_blocks : list of int + Indices of blocks to return features of. 
Possible values are: + - 0: corresponds to output of first max pooling + - 1: corresponds to output of second max pooling + - 2: corresponds to output which is fed to aux classifier + - 3: corresponds to output of final average pooling + resize_input : bool + If true, bilinearly resizes input to width and height 299 before + feeding input to model. As the network without fully connected + layers is fully convolutional, it should be able to handle inputs + of arbitrary size, so resizing might not be strictly needed + normalize_input : bool + If true, scales the input from range (0, 1) to the range the + pretrained Inception network expects, namely (-1, 1) + requires_grad : bool + If true, parameters of the model require gradients. Possibly useful + for finetuning the network + use_fid_inception : bool + If true, uses the pretrained Inception model used in Tensorflow's + FID implementation. If false, uses the pretrained Inception model + available in torchvision. The FID Inception model has different + weights and a slightly different structure from torchvision's + Inception model. If you want to compute FID scores, you are + strongly advised to set this parameter to true to get comparable + results. + """ + super(InceptionV3, self).__init__() + + self.resize_input = resize_input + self.normalize_input = normalize_input + self.output_blocks = sorted(output_blocks) + self.last_needed_block = max(output_blocks) + + assert self.last_needed_block <= 3, \ + 'Last possible output block index is 3' + + self.blocks = nn.ModuleList() + + if use_fid_inception: + inception = fid_inception_v3() + else: + inception = models.inception_v3(pretrained=True) + + # Block 0: input to maxpool1 + block0 = [ + inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, + inception.Conv2d_2b_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block0)) + + # Block 1: maxpool1 to maxpool2 + if self.last_needed_block >= 1: + block1 = [ + inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block1)) + + # Block 2: maxpool2 to aux classifier + if self.last_needed_block >= 2: + block2 = [ + inception.Mixed_5b, + inception.Mixed_5c, + inception.Mixed_5d, + inception.Mixed_6a, + inception.Mixed_6b, + inception.Mixed_6c, + inception.Mixed_6d, + inception.Mixed_6e, + ] + self.blocks.append(nn.Sequential(*block2)) + + # Block 3: aux classifier to final avgpool + if self.last_needed_block >= 3: + block3 = [ + inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, + nn.AdaptiveAvgPool2d(output_size=(1, 1)) + ] + self.blocks.append(nn.Sequential(*block3)) + + for param in self.parameters(): + param.requires_grad = requires_grad + + def forward(self, inp): + """Get Inception feature maps + + Parameters + ---------- + inp : torch.autograd.Variable + Input tensor of shape Bx3xHxW. 
Values are expected to be in + range (0, 1) + + Returns + ------- + List of torch.autograd.Variable, corresponding to the selected output + block, sorted ascending by index + """ + outp = [] + x = inp + + if self.resize_input: + x = F.interpolate( + x, size=(299, 299), mode='bilinear', align_corners=False) + + if self.normalize_input: + x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) + + for idx, block in enumerate(self.blocks): + x = block(x) + if idx in self.output_blocks: + outp.append(x) + + if idx == self.last_needed_block: + break + + return outp + + +def fid_inception_v3(): + """Build pretrained Inception model for FID computation + + The Inception model for FID computation uses a different set of weights + and has a slightly different structure than torchvision's Inception. + + This method first constructs torchvision's Inception and then patches the + necessary parts that are different in the FID Inception model. + """ + LOGGER.info('fid_inception_v3 called') + inception = models.inception_v3( + num_classes=1008, aux_logits=False, pretrained=False) + LOGGER.info('models.inception_v3 done') + inception.Mixed_5b = FIDInceptionA(192, pool_features=32) + inception.Mixed_5c = FIDInceptionA(256, pool_features=64) + inception.Mixed_5d = FIDInceptionA(288, pool_features=64) + inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128) + inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192) + inception.Mixed_7b = FIDInceptionE_1(1280) + inception.Mixed_7c = FIDInceptionE_2(2048) + + LOGGER.info('fid_inception_v3 patching done') + + state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True) + LOGGER.info('fid_inception_v3 weights downloaded') + + inception.load_state_dict(state_dict) + LOGGER.info('fid_inception_v3 weights loaded into model') + + return inception + + +class FIDInceptionA(models.inception.InceptionA): + """InceptionA block patched for FID computation""" + + def __init__(self, in_channels, pool_features): + super(FIDInceptionA, self).__init__(in_channels, pool_features) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionC(models.inception.InceptionC): + """InceptionC block patched for FID computation""" + + def __init__(self, in_channels, channels_7x7): + super(FIDInceptionC, self).__init__(in_channels, channels_7x7) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = 
F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_1(models.inception.InceptionE): + """First InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_1, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_2(models.inception.InceptionE): + """Second InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_2, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: The FID Inception model uses max pooling instead of average + # pooling. This is likely an error in this specific Inception + # implementation, as other Inception models use average pooling here + # (which matches the description in the paper). 
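+        # It is kept as-is here so that results stay consistent with the reference
+        # implementation the FID weights were exported from.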
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) diff --git a/modelscope/models/cv/image_inpainting/modules/perceptual.py b/modelscope/models/cv/image_inpainting/modules/perceptual.py new file mode 100644 index 00000000..80fe2b96 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/perceptual.py @@ -0,0 +1,47 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from .ade20k import ModelBuilder + +IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] +IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] + + +class ResNetPL(nn.Module): + + def __init__(self, + weight=1, + weights_path=None, + arch_encoder='resnet50dilated', + segmentation=True): + super().__init__() + self.impl = ModelBuilder.get_encoder( + weights_path=weights_path, + arch_encoder=arch_encoder, + arch_decoder='ppm_deepsup', + fc_dim=2048, + segmentation=segmentation) + self.impl.eval() + for w in self.impl.parameters(): + w.requires_grad_(False) + + self.weight = weight + + def forward(self, pred, target): + pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) + target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) + + pred_feats = self.impl(pred, return_feature_maps=True) + target_feats = self.impl(target, return_feature_maps=True) + + result = torch.stack([ + F.mse_loss(cur_pred, cur_target) + for cur_pred, cur_target in zip(pred_feats, target_feats) + ]).sum() * self.weight + return result diff --git a/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py new file mode 100644 index 00000000..32e18f3e --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py @@ -0,0 +1,75 @@ +""" +The implementation is adopted from +https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py +""" +import collections +import functools +import logging +from collections import defaultdict +from functools import partial + +import numpy as np +import torch.nn as nn + + +# Defines the PatchGAN discriminator with the specified arguments. 
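+# Its forward() returns both the final patch logits and all intermediate activations,
+# which DefaultInpaintingTrainingModule feeds into the feature-matching loss.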
+class NLayerDiscriminator(nn.Module): + + def __init__( + self, + input_nc=3, + ndf=64, + n_layers=4, + norm_layer=nn.BatchNorm2d, + ): + super().__init__() + self.n_layers = n_layers + + kw = 4 + padw = int(np.ceil((kw - 1.0) / 2)) + sequence = [[ + nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), + nn.LeakyReLU(0.2, True) + ]] + + nf = ndf + for n in range(1, n_layers): + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + sequence += [[ + nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw) + ]] + + for n in range(len(sequence)): + setattr(self, 'model' + str(n), nn.Sequential(*sequence[n])) + + def get_all_activations(self, x): + res = [x] + for n in range(self.n_layers + 2): + model = getattr(self, 'model' + str(n)) + res.append(model(res[-1])) + return res[1:] + + def forward(self, x): + act = self.get_all_activations(x) + return act[-1], act[:-1] diff --git a/modelscope/models/cv/image_inpainting/refinement.py b/modelscope/models/cv/image_inpainting/refinement.py new file mode 100644 index 00000000..662d8a05 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/refinement.py @@ -0,0 +1,393 @@ +''' +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +''' +import cv2 +import numpy as np +import torch +import torch.nn as nn +from kornia.filters import gaussian_blur2d +from kornia.geometry.transform import resize +from kornia.morphology import erosion +from torch.nn import functional as F +from torch.optim import SGD, Adam +from tqdm import tqdm + +from .modules.ffc import FFCResnetBlock + + +def move_to_device(obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return {name: move_to_device(val, device) for name, val in obj.items()} + raise ValueError(f'Unexpected type {type(obj)}') + + +def ceil_modulo(x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + +def pad_tensor_to_modulo(img, mod): + batch_size, channels, height, width = img.shape + out_height = ceil_modulo(height, mod) + out_width = ceil_modulo(width, mod) + return F.pad( + img, + pad=(0, out_width - width, 0, out_height - height), + mode='reflect') + + +def _pyrdown(im: torch.Tensor, downsize: tuple = None): + """downscale the image""" + if downsize is None: + downsize = (im.shape[2] // 2, im.shape[3] // 2) + assert im.shape[ + 1] == 3, 'Expected shape for the input to be (n,3,height,width)' + im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0)) + im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False) + return im + + +def _pyrdown_mask(mask: torch.Tensor, + downsize: tuple = None, + eps: float = 1e-8, + blur_mask: bool = True, + round_up: bool = True): + """downscale the mask tensor + + Parameters + ---------- + mask : torch.Tensor + mask of size (B, 1, H, W) + downsize : tuple, optional + size to downscale to. 
If None, image is downscaled to half, by default None + eps : float, optional + threshold value for binarizing the mask, by default 1e-8 + blur_mask : bool, optional + if True, apply gaussian filter before downscaling, by default True + round_up : bool, optional + if True, values above eps are marked 1, else, values below 1-eps are marked 0, by default True + + Returns + ------- + torch.Tensor + downscaled mask + """ + + if downsize is None: + downsize = (mask.shape[2] // 2, mask.shape[3] // 2) + assert mask.shape[ + 1] == 1, 'Expected shape for the input to be (n,1,height,width)' + if blur_mask is True: + mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + else: + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + if round_up: + mask[mask >= eps] = 1 + mask[mask < eps] = 0 + else: + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _erode_mask(mask: torch.Tensor, + ekernel: torch.Tensor = None, + eps: float = 1e-8): + """erode the mask, and set gray pixels to 0""" + if ekernel is not None: + mask = erosion(mask, ekernel) + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _l1_loss(pred: torch.Tensor, + pred_downscaled: torch.Tensor, + ref: torch.Tensor, + mask: torch.Tensor, + mask_downscaled: torch.Tensor, + image: torch.Tensor, + on_pred: bool = True): + """l1 loss on src pixels, and downscaled predictions if on_pred=True""" + loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) + if on_pred: + loss += torch.mean( + torch.abs(pred_downscaled[mask_downscaled >= 1e-8] + - ref[mask_downscaled >= 1e-8])) + return loss + + +def _infer(image: torch.Tensor, + mask: torch.Tensor, + forward_front: nn.Module, + forward_rears: nn.Module, + ref_lower_res: torch.Tensor, + orig_shape: tuple, + devices: list, + scale_ind: int, + n_iters: int = 15, + lr: float = 0.002): + """Performs inference with refinement at a given scale. 
+ + Parameters + ---------- + image : torch.Tensor + input image to be inpainted, of size (1,3,H,W) + mask : torch.Tensor + input inpainting mask, of size (1,1,H,W) + forward_front : nn.Module + the front part of the inpainting network + forward_rears : nn.Module + the rear part of the inpainting network + ref_lower_res : torch.Tensor + the inpainting at previous scale, used as reference image + orig_shape : tuple + shape of the original input image before padding + devices : list + list of available devices + scale_ind : int + the scale index + n_iters : int, optional + number of iterations of refinement, by default 15 + lr : float, optional + learning rate, by default 0.002 + + Returns + ------- + torch.Tensor + inpainted image + """ + masked_image = image * (1 - mask) + masked_image = torch.cat([masked_image, mask], dim=1) + + mask = mask.repeat(1, 3, 1, 1) + if ref_lower_res is not None: + ref_lower_res = ref_lower_res.detach() + with torch.no_grad(): + z1, z2 = forward_front(masked_image) + # Inference + mask = mask.to(devices[-1]) + ekernel = torch.from_numpy( + cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (15, 15)).astype(bool)).float() + ekernel = ekernel.to(devices[-1]) + image = image.to(devices[-1]) + z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0]) + z1.requires_grad, z2.requires_grad = True, True + + optimizer = Adam([z1, z2], lr=lr) + + pbar = tqdm(range(n_iters), leave=False) + for idi in pbar: + optimizer.zero_grad() + input_feat = (z1, z2) + for idd, forward_rear in enumerate(forward_rears): + output_feat = forward_rear(input_feat) + if idd < len(devices) - 1: + midz1, midz2 = output_feat + midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to( + devices[idd + 1]) + input_feat = (midz1, midz2) + else: + pred = output_feat + + if ref_lower_res is None: + break + losses = {} + # scaled loss with downsampler + pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]]) + mask_downscaled = _pyrdown_mask( + mask[:, :1, :orig_shape[0], :orig_shape[1]], + blur_mask=False, + round_up=False) + mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel) + mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1) + losses['ms_l1'] = _l1_loss( + pred, + pred_downscaled, + ref_lower_res, + mask, + mask_downscaled, + image, + on_pred=True) + + loss = sum(losses.values()) + pbar.set_description( + 'Refining scale {} using scale {} ...current loss: {:.4f}'.format( + scale_ind + 1, scale_ind, loss.item())) + if idi < n_iters - 1: + loss.backward() + optimizer.step() + del pred_downscaled + del loss + del pred + # "pred" is the prediction after Plug-n-Play module + inpainted = mask * pred + (1 - mask) * image + inpainted = inpainted.detach().cpu() + return inpainted + + +def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, + px_budget: int): + """Build the image mask pyramid + + Parameters + ---------- + batch : dict + batch containing image, mask, etc + min_side : int + minimum side length to limit the number of scales of the pyramid + max_scales : int + maximum number of scales allowed + px_budget : int + the product H*W cannot exceed this budget, because of resource constraints + + Returns + ------- + tuple + image-mask pyramid in the form of list of images and list of masks + """ + + assert batch['image'].shape[ + 0] == 1, 'refiner works on only batches of size 1!' 
+ + h, w = batch['unpad_to_size'] + h, w = h[0].item(), w[0].item() + + image = batch['image'][..., :h, :w] + mask = batch['mask'][..., :h, :w] + if h * w > px_budget: + # resize + ratio = np.sqrt(px_budget / float(h * w)) + h_orig, w_orig = h, w + h, w = int(h * ratio), int(w * ratio) + print( + f'Original image too large for refinement! Resizing {(h_orig,w_orig)} to {(h,w)}...' + ) + image = resize( + image, (h, w), interpolation='bilinear', align_corners=False) + mask = resize( + mask, (h, w), interpolation='bilinear', align_corners=False) + mask[mask > 1e-8] = 1 + breadth = min(h, w) + n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), + max_scales) + ls_images = [] + ls_masks = [] + + ls_images.append(image) + ls_masks.append(mask) + + for _ in range(n_scales - 1): + image_p = _pyrdown(ls_images[-1]) + mask_p = _pyrdown_mask(ls_masks[-1]) + ls_images.append(image_p) + ls_masks.append(mask_p) + # reverse the lists because we want the lowest resolution image as index 0 + return ls_images[::-1], ls_masks[::-1] + + +def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str, + modulo: int, n_iters: int, lr: float, min_side: int, + max_scales: int, px_budget: int): + """Refines the inpainting of the network + + Parameters + ---------- + batch : dict + image-mask batch, currently we assume the batchsize to be 1 + inpainter : nn.Module + the inpainting neural network + gpu_ids : str + the GPU ids of the machine to use. If only single GPU, use: "0," + modulo : int + pad the image to ensure dimension % modulo == 0 + n_iters : int + number of iterations of refinement for each scale + lr : float + learning rate + min_side : int + all sides of image on all scales should be >= min_side / sqrt(2) + max_scales : int + max number of downscaling scales for the image-mask pyramid + px_budget : int + pixels budget. 
Any image will be resized to satisfy height*width <= px_budget + + Returns + ------- + torch.Tensor + inpainted image of size (1,3,H,W) + """ + inpainter = inpainter.model + assert not inpainter.training + assert not inpainter.add_noise_kwargs + assert inpainter.concat_mask + + gpu_ids = [ + f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',') + if gpuid.isdigit() + ] + n_resnet_blocks = 0 + first_resblock_ind = 0 + found_first_resblock = False + for idl in range(len(inpainter.generator.model)): + if isinstance(inpainter.generator.model[idl], FFCResnetBlock): + n_resnet_blocks += 1 + found_first_resblock = True + elif not found_first_resblock: + first_resblock_ind += 1 + resblocks_per_gpu = n_resnet_blocks // len(gpu_ids) + + devices = [torch.device(gpu_id) for gpu_id in gpu_ids] + + # split the model into front, and rear parts + forward_front = inpainter.generator.model[0:first_resblock_ind] + forward_front.to(devices[0]) + forward_rears = [] + for idd in range(len(gpu_ids)): + if idd < len(gpu_ids) - 1: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu + * (idd):first_resblock_ind + + resblocks_per_gpu * (idd + 1)]) + else: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu * (idd):]) + forward_rears[idd].to(devices[idd]) + + ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales, + px_budget) + image_inpainted = None + + for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)): + orig_shape = image.shape[2:] + image = pad_tensor_to_modulo(image, modulo) + mask = pad_tensor_to_modulo(mask, modulo) + mask[mask >= 1e-8] = 1.0 + mask[mask < 1e-8] = 0.0 + image, mask = move_to_device(image, devices[0]), move_to_device( + mask, devices[0]) + if image_inpainted is not None: + image_inpainted = move_to_device(image_inpainted, devices[-1]) + image_inpainted = _infer(image, mask, forward_front, forward_rears, + image_inpainted, orig_shape, devices, ids, + n_iters, lr) + image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]] + # detach everything to save resources + image = image.detach().cpu() + mask = mask.detach().cpu() + + return image_inpainted diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index e2bf5bc1..35c060f0 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset from .movie_scene_segmentation import MovieSceneSegmentationDataset from .video_summarization_dataset import VideoSummarizationDataset + from .image_inpainting import ImageInpaintingDataset from .passage_ranking_dataset import PassageRankingDataset else: @@ -24,6 +25,7 @@ else: ['ImageInstanceSegmentationCocoDataset'], 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], + 'image_inpainting': ['ImageInpaintingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py new file mode 100644 index 00000000..732a1bd7 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .image_inpainting_dataset import ImageInpaintingDataset diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/aug.py b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py new file mode 100644 index 00000000..445bb9b4 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py @@ -0,0 +1,100 @@ +""" +The implementation is borrowed from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import imgaug.augmenters as iaa +from albumentations import DualIAATransform, to_tuple + + +class IAAAffine2(DualIAATransform): + """Place a regular grid of points on the input and randomly move the neighbourhood of these point around + via affine transformations. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + p (float): probability of applying the transform. Default: 0.5. + + Targets: + image, mask + """ + + def __init__( + self, + scale=(0.7, 1.3), + translate_percent=None, + translate_px=None, + rotate=0.0, + shear=(-0.1, 0.1), + order=1, + cval=0, + mode='reflect', + always_apply=False, + p=0.5, + ): + super(IAAAffine2, self).__init__(always_apply, p) + self.scale = dict(x=scale, y=scale) + self.translate_percent = to_tuple(translate_percent, 0) + self.translate_px = to_tuple(translate_px, 0) + self.rotate = to_tuple(rotate) + self.shear = dict(x=shear, y=shear) + self.order = order + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.Affine( + self.scale, + self.translate_percent, + self.translate_px, + self.rotate, + self.shear, + self.order, + self.cval, + self.mode, + ) + + def get_transform_init_args_names(self): + return ('scale', 'translate_percent', 'translate_px', 'rotate', + 'shear', 'order', 'cval', 'mode') + + +class IAAPerspective2(DualIAATransform): + """Perform a random four point perspective transform of the input. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + scale ((float, float): standard deviation of the normal distributions. These are used to sample + the random distances of the subimage's corners from the full image's corners. Default: (0.05, 0.1). + p (float): probability of applying the transform. Default: 0.5. 
+ + Targets: + image, mask + """ + + def __init__(self, + scale=(0.05, 0.1), + keep_size=True, + always_apply=False, + p=0.5, + order=1, + cval=0, + mode='replicate'): + super(IAAPerspective2, self).__init__(always_apply, p) + self.scale = to_tuple(scale, 1.0) + self.keep_size = keep_size + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.PerspectiveTransform( + self.scale, + keep_size=self.keep_size, + mode=self.mode, + cval=self.cval) + + def get_transform_init_args_names(self): + return ('scale', 'keep_size') diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py new file mode 100644 index 00000000..057b8f88 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py @@ -0,0 +1,337 @@ +""" +Part of the implementation is borrowed and modified from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import glob +import os +import os.path as osp +from enum import Enum + +import albumentations as A +import cv2 +import json +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .aug import IAAAffine2, IAAPerspective2 + +LOGGER = get_logger() + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class DrawMethod(Enum): + LINE = 'line' + CIRCLE = 'circle' + SQUARE = 'square' + + +def make_random_superres_mask(shape, + min_step=2, + max_step=4, + min_width=1, + max_width=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + step_x = np.random.randint(min_step, max_step + 1) + width_x = np.random.randint(min_width, min(step_x, max_width + 1)) + offset_x = np.random.randint(0, step_x) + + step_y = np.random.randint(min_step, max_step + 1) + width_y = np.random.randint(min_width, min(step_y, max_width + 1)) + offset_y = np.random.randint(0, step_y) + + for dy in range(width_y): + mask[offset_y + dy::step_y] = 1 + for dx in range(width_x): + mask[:, offset_x + dx::step_x] = 1 + return mask[None, ...] 
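+
+
+# The *MaskGenerator classes below are callable wrappers around the mask
+# functions above; those built with ramp_kwargs use LinearRamp to gradually
+# enlarge the masks as the training iteration index grows.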
+ + +class RandomSuperresMaskGenerator: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def __call__(self, img, iter_i=None): + return make_random_superres_mask(img.shape[1:], **self.kwargs) + + +def make_random_rectangle_mask(shape, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + box_width = np.random.randint(bbox_min_size, bbox_max_size) + box_height = np.random.randint(bbox_min_size, bbox_max_size) + start_x = np.random.randint(margin, width - margin - box_width + 1) + start_y = np.random.randint(margin, height - margin - box_height + 1) + mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1 + return mask[None, ...] + + +class RandomRectangleMaskGenerator: + + def __init__(self, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3, + ramp_kwargs=None): + self.margin = margin + self.bbox_min_size = bbox_min_size + self.bbox_max_size = bbox_max_size + self.min_times = min_times + self.max_times = max_times + self.ramp = LinearRamp( + **ramp_kwargs) if ramp_kwargs is not None else None + + def __call__(self, img, iter_i=None, raw_image=None): + coef = self.ramp(iter_i) if (self.ramp is not None) and ( + iter_i is not None) else 1 + cur_bbox_max_size = int(self.bbox_min_size + 1 + + (self.bbox_max_size - self.bbox_min_size) + * coef) + cur_max_times = int(self.min_times + + (self.max_times - self.min_times) * coef) + return make_random_rectangle_mask( + img.shape[1:], + margin=self.margin, + bbox_min_size=self.bbox_min_size, + bbox_max_size=cur_bbox_max_size, + min_times=self.min_times, + max_times=cur_max_times) + + +def make_random_irregular_mask(shape, + max_angle=4, + max_len=60, + max_width=20, + min_times=0, + max_times=10, + draw_method=DrawMethod.LINE): + draw_method = DrawMethod(draw_method) + + height, width = shape + mask = np.zeros((height, width), np.float32) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + start_x = np.random.randint(width) + start_y = np.random.randint(height) + for j in range(1 + np.random.randint(5)): + angle = 0.01 + np.random.randint(max_angle) + if i % 2 == 0: + angle = 2 * 3.1415926 - angle + length = 10 + np.random.randint(max_len) + brush_w = 5 + np.random.randint(max_width) + end_x = np.clip( + (start_x + length * np.sin(angle)).astype(np.int32), 0, width) + end_y = np.clip( + (start_y + length * np.cos(angle)).astype(np.int32), 0, height) + if draw_method == DrawMethod.LINE: + cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, + brush_w) + elif draw_method == DrawMethod.CIRCLE: + cv2.circle( + mask, (start_x, start_y), + radius=brush_w, + color=1., + thickness=-1) + elif draw_method == DrawMethod.SQUARE: + radius = brush_w // 2 + mask[start_y - radius:start_y + radius, + start_x - radius:start_x + radius] = 1 + start_x, start_y = end_x, end_y + return mask[None, ...] 
+
+
+class RandomIrregularMaskGenerator:
+
+    def __init__(self,
+                 max_angle=4,
+                 max_len=60,
+                 max_width=20,
+                 min_times=0,
+                 max_times=10,
+                 ramp_kwargs=None,
+                 draw_method=DrawMethod.LINE):
+        self.max_angle = max_angle
+        self.max_len = max_len
+        self.max_width = max_width
+        self.min_times = min_times
+        self.max_times = max_times
+        self.draw_method = draw_method
+        self.ramp = LinearRamp(
+            **ramp_kwargs) if ramp_kwargs is not None else None
+
+    def __call__(self, img, iter_i=None, raw_image=None):
+        coef = self.ramp(iter_i) if (self.ramp is not None) and (
+            iter_i is not None) else 1
+        cur_max_len = int(max(1, self.max_len * coef))
+        cur_max_width = int(max(1, self.max_width * coef))
+        cur_max_times = int(self.min_times + 1
+                            + (self.max_times - self.min_times) * coef)
+        return make_random_irregular_mask(
+            img.shape[1:],
+            max_angle=self.max_angle,
+            max_len=cur_max_len,
+            max_width=cur_max_width,
+            min_times=self.min_times,
+            max_times=cur_max_times,
+            draw_method=self.draw_method)
+
+
+class MixedMaskGenerator:
+
+    def __init__(self,
+                 irregular_proba=1 / 3,
+                 irregular_kwargs=None,
+                 box_proba=1 / 3,
+                 box_kwargs=None,
+                 segm_proba=1 / 3,
+                 segm_kwargs=None,
+                 squares_proba=0,
+                 squares_kwargs=None,
+                 superres_proba=0,
+                 superres_kwargs=None,
+                 outpainting_proba=0,
+                 outpainting_kwargs=None,
+                 invert_proba=0):
+        self.probas = []
+        self.gens = []
+
+        if irregular_proba > 0:
+            self.probas.append(irregular_proba)
+            if irregular_kwargs is None:
+                irregular_kwargs = {}
+            else:
+                irregular_kwargs = dict(irregular_kwargs)
+            irregular_kwargs['draw_method'] = DrawMethod.LINE
+            self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs))
+
+        if box_proba > 0:
+            self.probas.append(box_proba)
+            if box_kwargs is None:
+                box_kwargs = {}
+            self.gens.append(RandomRectangleMaskGenerator(**box_kwargs))
+
+        if squares_proba > 0:
+            self.probas.append(squares_proba)
+            if squares_kwargs is None:
+                squares_kwargs = {}
+            else:
+                squares_kwargs = dict(squares_kwargs)
+            squares_kwargs['draw_method'] = DrawMethod.SQUARE
+            self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs))
+
+        if superres_proba > 0:
+            self.probas.append(superres_proba)
+            if superres_kwargs is None:
+                superres_kwargs = {}
+            self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs))
+
+        self.probas = np.array(self.probas, dtype='float32')
+        self.probas /= self.probas.sum()
+        self.invert_proba = invert_proba
+
+    def __call__(self, img, iter_i=None, raw_image=None):
+        kind = np.random.choice(len(self.probas), p=self.probas)
+        gen = self.gens[kind]
+        result = gen(img, iter_i=iter_i, raw_image=raw_image)
+        if self.invert_proba > 0 and np.random.random() < self.invert_proba:
+            result = 1 - result
+        return result
+
+
+def get_transforms(test_mode, out_size):
+    if not test_mode:
+        transform = A.Compose([
+            IAAPerspective2(scale=(0.0, 0.06)),
+            IAAAffine2(scale=(0.7, 1.3), rotate=(-40, 40), shear=(-0.1, 0.1)),
+            A.PadIfNeeded(min_height=out_size, min_width=out_size),
+            A.OpticalDistortion(),
+            A.RandomCrop(height=out_size, width=out_size),
+            A.HorizontalFlip(),
+            A.CLAHE(),
+            A.RandomBrightnessContrast(
+                brightness_limit=0.2, contrast_limit=0.2),
+            A.HueSaturationValue(
+                hue_shift_limit=5, sat_shift_limit=30, val_shift_limit=5),
+            A.ToFloat()
+        ])
+    else:
+        transform = A.Compose([
+            A.PadIfNeeded(min_height=out_size, min_width=out_size),
+            A.CenterCrop(height=out_size, width=out_size),
+            A.ToFloat()
+        ])
+    return transform
+
+
+@TASK_DATASETS.register_module(
+    Tasks.image_inpainting, module_name=Models.image_inpainting)
+class 
ImageInpaintingDataset(TorchTaskDataset): + + def __init__(self, **kwargs): + split_config = kwargs['split_config'] + LOGGER.info(kwargs) + mode = kwargs.get('test_mode', False) + + self.data_root = next(iter(split_config.values())) + if not osp.exists(self.data_root): + self.data_root = osp.dirname(self.data_root) + assert osp.exists(self.data_root) + mask_gen_kwargs = kwargs.get('mask_gen_kwargs', {}) + out_size = kwargs.get('out_size', 256) + self.mask_generator = MixedMaskGenerator(**mask_gen_kwargs) + self.transform = get_transforms(mode, out_size) + self.in_files = sorted( + list( + glob.glob( + osp.join(self.data_root, '**', '*.jpg'), recursive=True)) + + list( + glob.glob( + osp.join(self.data_root, '**', '*.png'), recursive=True))) + self.iter_i = 0 + + def __len__(self): + return len(self.in_files) + + def __getitem__(self, index): + path = self.in_files[index] + img = cv2.imread(path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = self.transform(image=img)['image'] + img = np.transpose(img, (2, 0, 1)) + # TODO: maybe generate mask before augmentations? slower, but better for segmentation-based masks + mask = self.mask_generator(img, iter_i=self.iter_i) + self.iter_i += 1 + return dict(image=img, mask=mask) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 07a14191..dd59d6fb 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -177,6 +177,7 @@ TASK_OUTPUTS = { Tasks.image_denoising: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG], Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG], + Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG], # image generation task result for a single image # {"output_img": np.array with shape (h, w, 3)} diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index c9a70d14..b18d4465 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -181,6 +181,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), Tasks.shop_segmentation: (Pipelines.shop_segmentation, 'damo/cv_vitb16_segmentation_shop-seg'), + Tasks.image_inpainting: (Pipelines.image_inpainting, + 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), Tasks.hand_static: (Pipelines.hand_static, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 55bad09a..118eaf17 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from .image_super_resolution_pipeline import ImageSuperResolutionPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline + from .image_inpainting_pipeline import ImageInpaintingPipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline from .live_category_pipeline import LiveCategoryPipeline @@ -99,6 +100,7 @@ else: 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generation_pipeline': ['Image2ImageGenerationPipeline'], + 'image_inpainting_pipeline': ['ImageInpaintingPipeline'], 'ocr_detection_pipeline': ['OCRDetectionPipeline'], 'ocr_recognition_pipeline': ['OCRRecognitionPipeline'], 'skin_retouching_pipeline': ['SkinRetouchingPipeline'], diff --git a/modelscope/pipelines/cv/image_inpainting_pipeline.py 
b/modelscope/pipelines/cv/image_inpainting_pipeline.py new file mode 100644 index 00000000..6ae0d63e --- /dev/null +++ b/modelscope/pipelines/cv/image_inpainting_pipeline.py @@ -0,0 +1,146 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch +import torch.nn as nn +from torch.utils.data._utils.collate import default_collate + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.models.cv.image_inpainting.refinement import refine_predict +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_inpainting, module_name=Pipelines.image_inpainting) +class ImageInpaintingPipeline(Pipeline): + + def __init__(self, + model: str, + pad_out_to_modulo=8, + refine=False, + **kwargs): + """ + model: model id on modelscope hub. + """ + assert isinstance(model, str), 'model must be a single str' + super().__init__(model=model, auto_collate=False, **kwargs) + self.refine = refine + logger.info(f'loading model from dir {model}') + self.infer_model = FFTInpainting(model, predict_only=True) + if not self.refine: + self.infer_model.to(self.device) + self.infer_model.eval() + logger.info(f'loading model done, refinement is set to {self.refine}') + self.pad_out_to_modulo = pad_out_to_modulo + + def move_to_device(self, obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [self.move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return { + name: self.move_to_device(val, device) + for name, val in obj.items() + } + raise ValueError(f'Unexpected type {type(obj)}') + + def transforms(self, img): + if img.ndim == 3: + img = np.transpose(img, (2, 0, 1)) + out_img = img.astype('float32') / 255 + return out_img + + def ceil_modulo(self, x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + def pad_img_to_modulo(self, img, mod): + channels, height, width = img.shape + out_height = self.ceil_modulo(height, mod) + out_width = self.ceil_modulo(width, mod) + return np.pad( + img, ((0, 0), (0, out_height - height), (0, out_width - width)), + mode='symmetric') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + image_name, mask_name = input.split('+') + img = LoadImage.convert_to_ndarray(image_name) + img = self.transforms(img) + mask = np.array(LoadImage(mode='L')(mask_name)['img']) + mask = self.transforms(mask) + elif isinstance(input, PIL.Image.Image): + img = input.crop((0, 0, int(input.width / 2), input.height)) + img = self.transforms(np.array(img)) + mask = input.crop((int(input.width / 2), 0, input.width, + input.height)).convert('L') + mask = self.transforms(np.array(mask)) + else: + raise TypeError('input should be either str or PIL.Image') + result = dict(image=img, mask=mask[None, ...]) + + if self.pad_out_to_modulo is not None and self.pad_out_to_modulo > 1: + result['unpad_to_size'] = result['image'].shape[1:] + result['image'] = self.pad_img_to_modulo(result['image'], + self.pad_out_to_modulo) + result['mask'] = self.pad_img_to_modulo(result['mask'], + 
self.pad_out_to_modulo) + + # Since Pipeline use default torch.no_grad() for performing forward func. + # We conduct inference here in case of doing training for refinement. + result = self.perform_inference(result) + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.OUTPUT_IMG: input} + + def perform_inference(self, data): + batch = default_collate([data]) + if self.refine: + assert 'unpad_to_size' in batch, 'Unpadded size is required for the refinement' + assert 'cuda' in str(self.device), 'GPU is required for refinement' + gpu_ids = str(self.device).split(':')[-1] + cur_res = refine_predict( + batch, + self.infer_model, + gpu_ids=gpu_ids, + modulo=self.pad_out_to_modulo, + n_iters=15, + lr=0.002, + min_side=512, + max_scales=3, + px_budget=900000) + cur_res = cur_res[0].permute(1, 2, 0).detach().cpu().numpy() + else: + with torch.no_grad(): + batch = self.move_to_device(batch, self.device) + batch['mask'] = (batch['mask'] > 0) * 1 + batch = self.infer_model(batch) + cur_res = batch['inpainted'][0].permute( + 1, 2, 0).detach().cpu().numpy() + unpad_to_size = batch.get('unpad_to_size', None) + if unpad_to_size is not None: + orig_height, orig_width = unpad_to_size + cur_res = cur_res[:orig_height, :orig_width] + + cur_res = np.clip(cur_res * 255, 0, 255).astype('uint8') + cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR) + return cur_res + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index a632642a..86917261 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from .builder import build_trainer from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer, - MovieSceneSegmentationTrainer) + MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer, PassageRankingTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer @@ -22,7 +22,8 @@ else: 'builder': ['build_trainer'], 'cv': [ 'ImageInstanceSegmentationTrainer', - 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer' + 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer', + 'ImageInpaintingTrainer' ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'], diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index 4c65870e..d09fd75c 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: ImageInstanceSegmentationTrainer from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer + from .image_inpainting_trainer import ImageInpaintingTrainer else: _import_structure = { @@ -15,7 +16,8 @@ else: ['ImageInstanceSegmentationTrainer'], 'image_portrait_enhancement_trainer': ['ImagePortraitEnhancementTrainer'], - 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'] + 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'], + 'image_inpainting_trainer': ['ImageInpaintingTrainer'] } import sys diff --git a/modelscope/trainers/cv/image_inpainting_trainer.py b/modelscope/trainers/cv/image_inpainting_trainer.py new file mode 100644 index 00000000..74d1ed9f --- /dev/null +++ 
b/modelscope/trainers/cv/image_inpainting_trainer.py @@ -0,0 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import time +from collections.abc import Mapping + +from torch import distributed as dist + +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.trainer import EpochBasedTrainer +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device +from modelscope.utils.file_utils import func_receive_dict_inputs + + +@TRAINERS.register_module(module_name=Trainers.image_inpainting) +class ImageInpaintingTrainer(EpochBasedTrainer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def train(self, *args, **kwargs): + super().train(*args, **kwargs) + + def evaluate(self, *args, **kwargs): + metric_values = super().evaluate(*args, **kwargs) + return metric_values + + def prediction_step(self, model, inputs): + pass + + def train_loop(self, data_loader): + """ Training loop used by `EpochBasedTrainer.train()` + """ + self.invoke_hook(TrainerStages.before_run) + self._epoch = 0 + self.model.train() + for _ in range(self._epoch, self._max_epochs): + self.invoke_hook(TrainerStages.before_train_epoch) + for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) + self.data_batch = data_batch + self._inner_iter = i + for idx in range(2): + self.invoke_hook(TrainerStages.before_train_iter) + self.train_step(self.model, data_batch, idx) + self.invoke_hook(TrainerStages.after_train_iter) + del self.data_batch + self._iter += 1 + self._mode = ModeKeys.TRAIN + + if i + 1 >= self.iters_per_epoch: + break + + self.invoke_hook(TrainerStages.after_train_epoch) + self._epoch += 1 + + self.invoke_hook(TrainerStages.after_run) + + def train_step(self, model, inputs, idx): + """ Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`TorchModel`): The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. 
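+
+        Note:
+            `idx` is forwarded to the underlying model as `optimizer_idx`; the
+            training loop invokes this step twice per batch (idx 0 and 1), e.g.
+            for the generator and discriminator updates of the adversarial
+            model.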
+ """ + # EvaluationHook will do evaluate and change mode to val, return to train mode + # TODO: find more pretty way to change mode + model.train() + self._mode = ModeKeys.TRAIN + # call model forward but not __call__ to skip postprocess + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): + train_outputs = model.model._do_step(**inputs, optimizer_idx=idx) + else: + train_outputs = model.model._do_step(inputs, optimizer_idx=idx) + + if not isinstance(train_outputs, dict): + raise TypeError('"model.forward()" must return a dict') + + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + + self.train_outputs = train_outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2331dc85..2a5ac694 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -47,6 +47,8 @@ class CVTasks(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + crowd_counting = 'crowd-counting' + # image editing skin_retouching = 'skin-retouching' image_super_resolution = 'image-super-resolution' @@ -54,6 +56,7 @@ class CVTasks(object): image_color_enhancement = 'image-color-enhancement' image_denoising = 'image-denoising' image_portrait_enhancement = 'image-portrait-enhancement' + image_inpainting = 'image-inpainting' # image generation image_to_image_translation = 'image-to-image-translation' @@ -72,7 +75,6 @@ class CVTasks(object): video_category = 'video-category' video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' - crowd_counting = 'crowd-counting' movie_scene_segmentation = 'movie-scene-segmentation' # video editing diff --git a/requirements/cv.txt b/requirements/cv.txt index f907256d..e6ffb5ff 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -7,6 +7,8 @@ ffmpeg-python>=0.2.0 ftfy imageio>=2.9.0 imageio-ffmpeg>=0.4.2 +imgaug>=0.4.0 +kornia>=0.5.0 lmdb lpips ml_collections diff --git a/tests/pipelines/test_image_inpainting.py b/tests/pipelines/test_image_inpainting.py new file mode 100644 index 00000000..b89ce399 --- /dev/null +++ b/tests/pipelines/test_image_inpainting.py @@ -0,0 +1,77 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
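+# Unit tests for the LaMa-based image inpainting pipeline. String inputs take
+# the form '<image_path>+<mask_path>'; alternatively, a single PIL image whose
+# right half is the mask can be passed directly.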
+import unittest
+
+import cv2
+import torch
+from PIL import Image
+
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+class ImageInpaintingTest(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.input_location = 'data/test/images/image_inpainting/image_inpainting.png'
+        self.input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask.png'
+        self.model_id = 'damo/cv_fft_inpainting_lama'
+
+    def save_result(self, result):
+        vis_img = result[OutputKeys.OUTPUT_IMG]
+        cv2.imwrite('result.png', vis_img)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_inpainting(self):
+        inpainting = pipeline(Tasks.image_inpainting, model=self.model_id)
+        result = inpainting(self.input_location + '+'
+                            + self.input_mask_location)
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+    def test_inpainting_with_refinement(self):
+        # if the input image is high-resolution, setting refine=True gives better results
+        inpainting = pipeline(
+            Tasks.image_inpainting, model=self.model_id, refine=True)
+        result = inpainting(self.input_location + '+'
+                            + self.input_mask_location)
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_inpainting_with_image(self):
+        inpainting = pipeline(Tasks.image_inpainting, model=self.model_id)
+        img = Image.open(self.input_location).convert('RGB')
+        mask = Image.open(self.input_mask_location).convert('RGB')
+        img_new = Image.new('RGB', (img.width + mask.width, img.height))
+        img_new.paste(img, (0, 0))
+        img_new.paste(mask, (img.width, 0))
+        result = inpainting(img_new)
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_inpainting_with_default_task(self):
+        inpainting = pipeline(Tasks.image_inpainting)
+        result = inpainting(self.input_location + '+'
+                            + self.input_mask_location)
+        if result:
+            self.save_result(result)
+        else:
+            raise ValueError('process error')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/run_config.yaml b/tests/run_config.yaml
index 4c571b7f..b4149dc9 100644
--- a/tests/run_config.yaml
+++ b/tests/run_config.yaml
@@ -10,6 +10,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which
   - test_easycv_trainer.py
   - test_segformer.py
   - test_segmentation_pipeline.py
+  - test_image_inpainting.py
 
 envs:
   default: # default env, case not in other env will in default, pytorch.
diff --git a/tests/trainers/test_image_inpainting_trainer.py b/tests/trainers/test_image_inpainting_trainer.py
new file mode 100644
index 00000000..807fe64f
--- /dev/null
+++ b/tests/trainers/test_image_inpainting_trainer.py
@@ -0,0 +1,84 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageInpaintingTrainerTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/cv_fft_inpainting_lama' + self.cache_path = snapshot_download(self.model_id) + cfg = Config.from_file( + os.path.join(self.cache_path, ModelFile.CONFIGURATION)) + + train_data_cfg = ConfigDict( + name='PlacesToydataset', + split='train', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.train_out_size, + test_mode=False) + + test_data_cfg = ConfigDict( + name='PlacesToydataset', + split='test', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.val_out_size, + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + mask_gen_kwargs=train_data_cfg.mask_gen_kwargs, + out_size=train_data_cfg.out_size, + test_mode=train_data_cfg.test_mode) + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.test_dataset = MsDataset.load( + dataset_name=test_data_cfg.name, + split=test_data_cfg.split, + mask_gen_kwargs=test_data_cfg.mask_gen_kwargs, + out_size=test_data_cfg.out_size, + test_mode=test_data_cfg.test_mode) + assert next( + iter(self.test_dataset.config_kwargs['split_config'].values())) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer(self): + kwargs = dict( + model=self.model_id, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset) + + trainer = build_trainer( + name=Trainers.image_inpainting, default_args=kwargs) + trainer.train() + results_files = os.listdir(trainer.work_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + +if __name__ == '__main__': + unittest.main() From 2bfdbbc9d0d77372ccfeb85745c2bcf8c736b534 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Tue, 11 Oct 2022 22:23:36 +0800 Subject: [PATCH 019/129] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10357094 --- .../models/cv/face_detection/mogface/models/detectors.py | 2 ++ .../pipelines/cv/facial_expression_recognition_pipeline.py | 5 ++++- modelscope/utils/cv/image_utils.py | 7 +------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py index 5ae67104..8c1d9150 100644 --- a/modelscope/models/cv/face_detection/mogface/models/detectors.py +++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py @@ -1,3 +1,5 @@ +# The implementation is based on MogFace, available at +# https://github.com/damo-cv/MogFace import os import cv2 diff --git 
a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 1b1f13d1..b598a457 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -45,6 +45,9 @@ class FacialExpressionRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.map_list = [ + 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' + ] self.face_detection = pipeline( Tasks.face_detection, model=det_model_id) @@ -122,7 +125,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): labels = result[1].tolist() return { OutputKeys.SCORES: scores, - OutputKeys.LABELS: labels, + OutputKeys.LABELS: self.map_list[labels] } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 98ba533e..ad0d6c8e 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,12 +113,7 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label_idx = facial_expression_result[OutputKeys.LABELS] - map_list = [ - 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' - ] - label = map_list[label_idx] - + label = facial_expression_result[OutputKeys.LABELS] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 0d97f8959d2095ecfd4b43bb4eb607534474a44f Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 22:24:19 +0800 Subject: [PATCH 020/129] [to #42322933] test: unify kws pipeline input type to AUDIO Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10362437 --- .../test_key_word_spotting_farfield.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index f8c167de..bf61c9e7 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -22,18 +22,14 @@ class KWSFarfieldTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_normal(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_mono(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO) - } - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @@ -44,17 +40,6 @@ class KWSFarfieldTest(unittest.TestCase): self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_output(self): - kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE), - 'output_file': 'output.wav' - } - result = kws(inputs) - 
self.assertEqual(len(result['kws_list']), 5) - print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_input_bytes(self): with open(os.path.join(os.getcwd(), TEST_SPEECH_FILE), 'rb') as f: From da5d5cd10bf8bb75ff9fde2df3f0112308c35cc7 Mon Sep 17 00:00:00 2001 From: "xixing.tj" Date: Tue, 11 Oct 2022 22:37:57 +0800 Subject: [PATCH 021/129] [to #42322933]add copyright info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加ocr部分代码的copyright信息 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10342392 --- .../cv/ocr_utils/model_convnext_transformer.py | 1 + .../model_resnet_mutex_v4_linewithchar.py | 2 ++ .../pipelines/cv/ocr_utils/ocr_modules/convnext.py | 10 ++-------- .../cv/ocr_utils/ocr_modules/timm_tinyc.py | 6 ++---- .../pipelines/cv/ocr_utils/ocr_modules/vitstr.py | 9 ++------- modelscope/pipelines/cv/ocr_utils/ops.py | 2 ++ modelscope/pipelines/cv/ocr_utils/resnet18_v1.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/resnet_utils.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/utils.py | 1 + 9 files changed, 40 insertions(+), 19 deletions(-) diff --git a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py index cf5e2fe1..6ecff7ef 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py +++ b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import torch import torch.nn as nn diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py index d03ff405..2c2d5b00 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py +++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import tensorflow as tf from . import ops, resnet18_v1, resnet_utils diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py index c2059107..c0e30616 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py @@ -1,11 +1,5 @@ -""" Contains various versions of ConvNext Networks. -ConvNext Networks (ConvNext) were proposed in: - Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell and Saining Xie - A ConvNet for the 2020s. CVPR 2022. -Compared to https://github.com/facebookresearch/ConvNeXt, -we obtain different ConvNext variants by changing the network depth, width, -feature number, and downsample ratio. -""" +# Part of the implementation is borrowed and modified from ConvNext, +# publicly available at https://github.com/facebookresearch/ConvNeXt import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py index f54c0e78..555b1e42 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py @@ -1,7 +1,5 @@ -'''Referenced from rwightman's pytorch-image-models(timm). 
-Github: https://github.com/rwightman/pytorch-image-models -We use some modules and modify the parameters according to our network. -''' +# Part of the implementation is borrowed and modified from timm, +# publicly available at https://github.com/rwightman/pytorch-image-models import collections.abc import logging import math diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py index e7d96574..5ce3aeca 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py @@ -1,10 +1,5 @@ -""" Contains various versions of ViTSTR. -ViTSTR were proposed in: - Rowel Atienza - Vision transformer for fast and efficient scene text recognition. ICDAR 2021. -Compared to https://github.com/roatienza/deep-text-recognition-benchmark, -we obtain different ViTSTR variants by changing the network patch_size and in_chans. -""" +# Part of the implementation is borrowed and modified from ViTSTR, +# publicly available at https://github.com/roatienza/deep-text-recognition-benchmark from __future__ import absolute_import, division, print_function import logging from copy import deepcopy diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py index 09807b10..a36838a6 100644 --- a/modelscope/pipelines/cv/ocr_utils/ops.py +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import math import os import shutil diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py index 7930c5a3..85f9faca 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains definitions for the original form of Residual Networks. The 'v1' residual networks (ResNets) implemented in this module were proposed by: diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py index 0a9af224..2ccbd038 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains building blocks for various versions of Residual Networks. Residual networks (ResNets) were proposed in: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py index be8e3371..1d0fb297 100644 --- a/modelscope/pipelines/cv/ocr_utils/utils.py +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import cv2 import numpy as np From 922f4c589b8da4111e787cd38d0a53518aaa8ada Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 11 Oct 2022 22:46:30 +0800 Subject: [PATCH 022/129] =?UTF-8?q?[to=20#42322933]=E5=9B=BE=E5=83=8F?= =?UTF-8?q?=E5=8E=BB=E5=99=AAusing=20msdataset=20to=20load=20dataset=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-M?= =?UTF-8?q?aaS/MaaS-lib/codereview/10338265?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metrics/image_denoise_metric.py | 140 +++++++++++++++- .../cv/image_denoise/nafnet/NAFNet_arch.py | 5 + .../cv/image_denoise/nafnet/arch_util.py | 5 + .../image_denoise/nafnet_for_image_denoise.py | 1 + .../image_denoise_data/data_utils.py | 152 ------------------ .../image_denoise_dataset.py | 78 --------- .../sidd_image_denoising}/__init__.py | 4 +- .../sidd_image_denoising/data_utils.py | 46 ++++++ .../sidd_image_denoising_dataset.py | 62 +++++++ .../sidd_image_denoising}/transforms.py | 0 .../pipelines/cv/image_denoise_pipeline.py | 2 +- tests/pipelines/test_image_denoise.py | 25 ++- tests/trainers/test_image_denoise_trainer.py | 24 ++- 13 files changed, 284 insertions(+), 260 deletions(-) delete mode 100644 modelscope/msdatasets/image_denoise_data/data_utils.py delete mode 100644 modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/__init__.py (73%) create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/transforms.py (100%) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index 94ec9dc7..c6df8df1 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,7 +1,9 @@ +# The code is modified based on BasicSR metrics: +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py from typing import Dict +import cv2 import numpy as np -from skimage.metrics import peak_signal_noise_ratio, structural_similarity from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group @@ -34,12 +36,138 @@ class ImageDenoiseMetric(Metric): def evaluate(self): psnr_list, ssim_list = [], [] for (pred, label) in zip(self.preds, self.labels): - psnr_list.append( - peak_signal_noise_ratio(label[0], pred[0], data_range=255)) - ssim_list.append( - structural_similarity( - label[0], pred[0], multichannel=True, data_range=255)) + psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0)) + ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0)) return { 
MetricKeys.PSNR: np.mean(psnr_list), MetricKeys.SSIM: np.mean(ssim_list) } + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'" + ) + if len(img.shape) == 2: + img = img[..., None] + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + Returns: + float: PSNR result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + mse = np.mean((img - img2)**2) + if mse == 0: + return float('inf') + return 10. * np.log10(255. * 255. / mse) + + +def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate SSIM (structural similarity). + ``Paper: Image quality assessment: From error visibility to structural similarity`` + The results are the same as that of the official released MATLAB code in + https://ece.uwaterloo.ca/~z70wang/research/ssim/. + For three-channel images, SSIM is calculated for each channel and then + averaged. + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + Returns: + float: SSIM result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] 
+ + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + ssims = [] + for i in range(img.shape[2]): + ssims.append(_ssim(img[..., i], img2[..., i])) + return np.array(ssims).mean() + + +def _ssim(img, img2): + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img (ndarray): Images with range [0, 255] with order 'HWC'. + img2 (ndarray): Images with range [0, 255] with order 'HWC'. + Returns: + float: SSIM result. + """ + + c1 = (0.01 * 255)**2 + c2 = (0.03 * 255)**2 + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + + mu1 = cv2.filter2D(img, -1, window)[5:-5, + 5:-5] # valid mode for window size 11 + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim_map = tmp1 / tmp2 + return ssim_map.mean() diff --git a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py index 5b4e8ce1..c4de0729 100644 --- a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py +++ b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ + import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet/arch_util.py b/modelscope/models/cv/image_denoise/nafnet/arch_util.py index df394dd5..2d406141 100644 --- a/modelscope/models/cv/image_denoise/nafnet/arch_util.py +++ b/modelscope/models/cv/image_denoise/nafnet/arch_util.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index c484b37b..a6fbf22f 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
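A minimal, hypothetical check of the BasicSR-style metric helpers introduced in image_denoise_metric.py above; the module path and function signatures follow the diff, while the synthetic images are invented for illustration:

    # Illustration only: exercises calculate_psnr/calculate_ssim on synthetic
    # HWC images in the [0, 255] range (shapes and values are made up).
    import numpy as np
    from modelscope.metrics.image_denoise_metric import calculate_psnr, calculate_ssim

    rng = np.random.default_rng(0)
    gt = rng.integers(0, 256, size=(64, 64, 3)).astype(np.float64)      # clean image, HWC, [0, 255]
    noisy = np.clip(gt + rng.normal(0, 5, size=gt.shape), 0, 255)       # mildly perturbed copy

    print(calculate_psnr(gt, noisy, crop_border=0))   # finite dB value; identical inputs return inf
    print(calculate_ssim(gt, noisy, crop_border=0))   # SSIM averaged over the three channels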
import os from copy import deepcopy from typing import Any, Dict, Union diff --git a/modelscope/msdatasets/image_denoise_data/data_utils.py b/modelscope/msdatasets/image_denoise_data/data_utils.py deleted file mode 100644 index dd735830..00000000 --- a/modelscope/msdatasets/image_denoise_data/data_utils.py +++ /dev/null @@ -1,152 +0,0 @@ -# ------------------------------------------------------------------------ -# Modified from BasicSR (https://github.com/xinntao/BasicSR) -# Copyright 2018-2020 BasicSR Authors -# ------------------------------------------------------------------------ -import os -from os import path as osp - -import cv2 -import numpy as np -import torch - -from .transforms import mod_crop - - -def img2tensor(imgs, bgr2rgb=True, float32=True): - """Numpy array to tensor. - Args: - imgs (list[ndarray] | ndarray): Input images. - bgr2rgb (bool): Whether to change bgr to rgb. - float32 (bool): Whether to change to float32. - Returns: - list[tensor] | tensor: Tensor images. If returned results only have - one element, just return tensor. - """ - - def _totensor(img, bgr2rgb, float32): - if img.shape[2] == 3 and bgr2rgb: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - img = torch.from_numpy(img.transpose(2, 0, 1)) - if float32: - img = img.float() - return img - - if isinstance(imgs, list): - return [_totensor(img, bgr2rgb, float32) for img in imgs] - else: - return _totensor(imgs, bgr2rgb, float32) - - -def scandir(dir_path, keyword=None, recursive=False, full_path=False): - """Scan a directory to find the interested files. - Args: - dir_path (str): Path of the directory. - keyword (str | tuple(str), optional): File keyword that we are - interested in. Default: None. - recursive (bool, optional): If set to True, recursively scan the - directory. Default: False. - full_path (bool, optional): If set to True, include the dir_path. - Default: False. - Returns: - A generator for all the interested files with relative pathes. - """ - - if (keyword is not None) and not isinstance(keyword, (str, tuple)): - raise TypeError('"suffix" must be a string or tuple of strings') - - root = dir_path - - def _scandir(dir_path, keyword, recursive): - for entry in os.scandir(dir_path): - if not entry.name.startswith('.') and entry.is_file(): - if full_path: - return_path = entry.path - else: - return_path = osp.relpath(entry.path, root) - - if keyword is None: - yield return_path - elif keyword in return_path: - yield return_path - else: - if recursive: - yield from _scandir( - entry.path, keyword=keyword, recursive=recursive) - else: - continue - - return _scandir(dir_path, keyword=keyword, recursive=recursive) - - -def padding(img_lq, img_gt, gt_size): - h, w, _ = img_lq.shape - - h_pad = max(0, gt_size - h) - w_pad = max(0, gt_size - w) - - if h_pad == 0 and w_pad == 0: - return img_lq, img_gt - - img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - return img_lq, img_gt - - -def read_img_seq(path, require_mod_crop=False, scale=1): - """Read a sequence of images from a given folder path. - Args: - path (list[str] | str): List of image paths or image folder path. - require_mod_crop (bool): Require mod crop for each image. - Default: False. - scale (int): Scale factor for mod_crop. Default: 1. - Returns: - Tensor: size (t, c, h, w), RGB, [0, 1]. 
- """ - if isinstance(path, list): - img_paths = path - else: - img_paths = sorted(list(scandir(path, full_path=True))) - imgs = [cv2.imread(v).astype(np.float32) / 255. for v in img_paths] - if require_mod_crop: - imgs = [mod_crop(img, scale) for img in imgs] - imgs = img2tensor(imgs, bgr2rgb=True, float32=True) - imgs = torch.stack(imgs, dim=0) - return imgs - - -def paired_paths_from_folder(folders, keys, filename_tmpl): - """Generate paired paths from folders. - Args: - folders (list[str]): A list of folder path. The order of list should - be [input_folder, gt_folder]. - keys (list[str]): A list of keys identifying folders. The order should - be in consistent with folders, e.g., ['lq', 'gt']. - filename_tmpl (str): Template for each filename. Note that the - template excludes the file extension. Usually the filename_tmpl is - for files in the input folder. - Returns: - list[str]: Returned path list. - """ - assert len(folders) == 2, ( - 'The len of folders should be 2 with [input_folder, gt_folder]. ' - f'But got {len(folders)}') - assert len(keys) == 2, ( - 'The len of keys should be 2 with [input_key, gt_key]. ' - f'But got {len(keys)}') - input_folder, gt_folder = folders - input_key, gt_key = keys - - input_paths = list(scandir(input_folder, keyword='NOISY', recursive=True)) - gt_paths = list(scandir(gt_folder, keyword='GT', recursive=True)) - assert len(input_paths) == len(gt_paths), ( - f'{input_key} and {gt_key} datasets have different number of images: ' - f'{len(input_paths)}, {len(gt_paths)}.') - paths = [] - for idx in range(len(gt_paths)): - gt_path = os.path.join(gt_folder, gt_paths[idx]) - input_path = os.path.join(input_folder, gt_path.replace('GT', 'NOISY')) - - paths.append( - dict([(f'{input_key}_path', input_path), - (f'{gt_key}_path', gt_path)])) - return paths diff --git a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py b/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py deleted file mode 100644 index 96b777e6..00000000 --- a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from typing import Callable, List, Optional, Tuple, Union - -import cv2 -import numpy as np -from torch.utils import data - -from .data_utils import img2tensor, padding, paired_paths_from_folder -from .transforms import augment, paired_random_crop - - -def default_loader(path): - return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 - - -class PairedImageDataset(data.Dataset): - """Paired image dataset for image restoration. - """ - - def __init__(self, opt, root, is_train): - super(PairedImageDataset, self).__init__() - self.opt = opt - self.is_train = is_train - self.gt_folder, self.lq_folder = os.path.join( - root, opt.dataroot_gt), os.path.join(root, opt.dataroot_lq) - - if opt.filename_tmpl is not None: - self.filename_tmpl = opt.filename_tmpl - else: - self.filename_tmpl = '{}' - self.paths = paired_paths_from_folder([self.lq_folder, self.gt_folder], - ['lq', 'gt'], self.filename_tmpl) - - def __getitem__(self, index): - scale = self.opt.scale - - # Load gt and lq images. Dimension order: HWC; channel order: BGR; - # image range: [0, 1], float32. 
- gt_path = self.paths[index]['gt_path'] - img_gt = default_loader(gt_path) - lq_path = self.paths[index]['lq_path'] - img_lq = default_loader(lq_path) - - # augmentation for training - # if self.is_train: - gt_size = self.opt.gt_size - # padding - img_gt, img_lq = padding(img_gt, img_lq, gt_size) - - # random crop - img_gt, img_lq = paired_random_crop(img_gt, img_lq, gt_size, scale) - - # flip, rotation - img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, - self.opt.use_rot) - - # BGR to RGB, HWC to CHW, numpy to tensor - img_gt, img_lq = img2tensor([img_gt, img_lq], - bgr2rgb=True, - float32=True) - - return { - 'input': img_lq, - 'target': img_gt, - 'input_path': lq_path, - 'target_path': gt_path - } - - def __len__(self): - return len(self.paths) - - def to_torch_dataset( - self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, List[Callable]] = None, - **format_kwargs, - ): - return self diff --git a/modelscope/msdatasets/image_denoise_data/__init__.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py similarity index 73% rename from modelscope/msdatasets/image_denoise_data/__init__.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py index ba1d2df8..5376cd7c 100644 --- a/modelscope/msdatasets/image_denoise_data/__init__.py +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .image_denoise_dataset import PairedImageDataset + from .sidd_image_denoising_dataset import SiddImageDenoisingDataset else: _import_structure = { - 'image_denoise_dataset': ['PairedImageDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py new file mode 100644 index 00000000..33fce4c8 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + +import cv2 +import torch + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. 
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def padding(img_lq, img_gt, gt_size): + h, w, _ = img_lq.shape + + h_pad = max(0, gt_size - h) + w_pad = max(0, gt_size - w) + + if h_pad == 0 and w_pad == 0: + return img_lq, img_gt + + img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + return img_lq, img_gt diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py new file mode 100644 index 00000000..3f0cdae0 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import cv2 +import numpy as np + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from .data_utils import img2tensor, padding +from .transforms import augment, paired_random_crop + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 + + +@TASK_DATASETS.register_module( + Tasks.image_denoising, module_name=Models.nafnet) +class SiddImageDenoisingDataset(TorchTaskDataset): + """Paired image dataset for image restoration. + """ + + def __init__(self, dataset, opt, is_train): + self.dataset = dataset + self.opt = opt + self.is_train = is_train + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load gt and lq images. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. 
+ item_dict = self.dataset[index] + gt_path = item_dict['Clean Image:FILE'] + img_gt = default_loader(gt_path) + lq_path = item_dict['Noisy Image:FILE'] + img_lq = default_loader(lq_path) + + # augmentation for training + if self.is_train: + gt_size = self.opt.gt_size + # padding + img_gt, img_lq = padding(img_gt, img_lq, gt_size) + + # random crop + img_gt, img_lq = paired_random_crop( + img_gt, img_lq, gt_size, scale=1) + + # flip, rotation + img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, + self.opt.use_rot) + + # BGR to RGB, HWC to CHW, numpy to tensor + img_gt, img_lq = img2tensor([img_gt, img_lq], + bgr2rgb=True, + float32=True) + + return {'input': img_lq, 'target': img_gt} diff --git a/modelscope/msdatasets/image_denoise_data/transforms.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py similarity index 100% rename from modelscope/msdatasets/image_denoise_data/transforms.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py diff --git a/modelscope/pipelines/cv/image_denoise_pipeline.py b/modelscope/pipelines/cv/image_denoise_pipeline.py index a11abf36..34ac1e81 100644 --- a/modelscope/pipelines/cv/image_denoise_pipeline.py +++ b/modelscope/pipelines/cv/image_denoise_pipeline.py @@ -105,4 +105,4 @@ class ImageDenoisePipeline(Pipeline): def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: output_img = (input['output_tensor'].squeeze(0) * 255).cpu().permute( 1, 2, 0).numpy().astype('uint8') - return {OutputKeys.OUTPUT_IMG: output_img} + return {OutputKeys.OUTPUT_IMG: output_img[:, :, ::-1]} diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py index bf8cfd0f..d95dd343 100644 --- a/tests/pipelines/test_image_denoise.py +++ b/tests/pipelines/test_image_denoise.py @@ -2,8 +2,6 @@ import unittest -from PIL import Image - from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.outputs import OutputKeys @@ -20,16 +18,16 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.image_denoising self.model_id = 'damo/cv_nafnet_image-denoise_sidd' - demo_image_path = 'data/test/images/noisy-demo-1.png' + demo_image_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/noisy-demo-0.png' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) pipeline = ImageDenoisePipeline(cache_path) + pipeline.group_key = self.task denoise_img = pipeline( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -37,9 +35,8 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) pipeline_ins = pipeline(task=Tasks.image_denoising, model=model) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -47,18 +44,16 @@ 
class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.image_denoising, model=self.model_id) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.image_denoising) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 261ee4ed..0bcb8930 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -6,10 +6,12 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.image_denoise import NAFNetForImageDenoise -from modelscope.msdatasets.image_denoise_data import PairedImageDataset +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets.sidd_image_denoising import \ + SiddImageDenoisingDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level @@ -28,10 +30,20 @@ class ImageDenoiseTrainerTest(unittest.TestCase): self.cache_path = snapshot_download(self.model_id) self.config = Config.from_file( os.path.join(self.cache_path, ModelFile.CONFIGURATION)) - self.dataset_train = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=True) - self.dataset_val = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=False) + dataset_train = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='validation', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + dataset_val = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + self.dataset_train = SiddImageDenoisingDataset( + dataset_train, self.config.dataset, is_train=True) + self.dataset_val = SiddImageDenoisingDataset( + dataset_val, self.config.dataset, is_train=False) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) From 42be514bac5f985d6d8ce710646e2a79e3d81d39 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:17:11 +0800 Subject: [PATCH 024/129] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10372291 --- .../pipelines/cv/facial_expression_recognition_pipeline.py | 6 +----- modelscope/utils/cv/image_utils.py | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index b598a457..3c85ae62 100644 --- 
a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -122,11 +122,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): result = self.fer(input) assert result is not None scores = result[0].tolist() - labels = result[1].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.LABELS: self.map_list[labels] - } + return {OutputKeys.SCORES: scores, OutputKeys.LABELS: self.map_list} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index ad0d6c8e..eab74688 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,7 +113,9 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label = facial_expression_result[OutputKeys.LABELS] + scores = facial_expression_result[OutputKeys.SCORES] + labels = facial_expression_result[OutputKeys.LABELS] + label = labels[np.argmax(scores)] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 71459900544438b3d44bf0e922cdda64ac4d5701 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Wed, 12 Oct 2022 15:18:35 +0800 Subject: [PATCH 025/129] [to #42322933] reivse model problem and remove history sql for demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 相比于master上的tableqa,做出了如下修复: 1. 修复了schema linking中的问题。 2. 同时设置了有history sql和没有history sql的两种输入 3. 增加了sqlite执行逻辑,可以返回sql执行结果 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10365114 --- .../models/nlp/table_question_answering.py | 3 +- modelscope/outputs.py | 1 + .../nlp/table_question_answering_pipeline.py | 61 +++++++++--- .../preprocessors/star3/fields/database.py | 53 ++++++++++- .../preprocessors/star3/fields/schema_link.py | 33 +++++-- .../table_question_answering_preprocessor.py | 5 +- modelscope/utils/nlp/nlp_utils.py | 17 +--- .../test_table_question_answering.py | 94 +++++++++++++++---- 8 files changed, 206 insertions(+), 61 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index 3c91a518..c6a03ef3 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -3,9 +3,11 @@ import os from typing import Dict +import json import numpy import torch import torch.nn.functional as F +import tqdm from transformers import BertTokenizer from modelscope.metainfo import Models @@ -82,7 +84,6 @@ class TableQuestionAnswering(Model): if ntok.startswith('##'): ntok = ntok.replace('##', '') - tok = nlu1[idx:idx + 1].lower() if ntok == tok: conv_dict[i] = [idx, idx + 1] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index dd59d6fb..0f353d3d 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -37,6 +37,7 @@ class OutputKeys(object): WORD = 'word' KWS_LIST = 'kws_list' HISTORY = 'history' + QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' SHOT_NUM = 'shot_num' SCENE_NUM = 'scene_num' diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 96bfbc34..e1b2b07b 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -2,6 +2,8 @@ import os from typing 
import Any, Dict, Union +import json +import torch from transformers import BertTokenizer from modelscope.metainfo import Pipelines @@ -230,14 +232,16 @@ class TableQuestionAnsweringPipeline(Pipeline): str_sel_list.append(header_name) sql_sel_list.append(header_id) else: - str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_name + ' )') - sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_id + ' )') + str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_name + ')') + sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_id + ')') str_cond_list, sql_cond_list = [], [] for cond in sql['conds']: header_name = header_names[cond[0]] + if header_name == '空列': + continue header_id = '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]]) op = self.cond_ops[cond[1]] value = cond[2] @@ -248,12 +252,17 @@ class TableQuestionAnsweringPipeline(Pipeline): cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' - final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join(str_sel_list), - table['table_name'], - cond.join(str_cond_list)) - final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join(sql_sel_list), - table['table_id'], - cond.join(sql_cond_list)) + if len(str_cond_list) != 0: + final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join( + str_sel_list), table['table_name'], cond.join(str_cond_list)) + final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join( + sql_sel_list), table['table_id'], cond.join(sql_cond_list)) + else: + final_str = 'SELECT %s FROM %s' % (', '.join(str_sel_list), + table['table_name']) + final_sql = 'SELECT %s FROM `%s`' % (', '.join(sql_sel_list), + table['table_id']) + sql = SQLQuery( string=final_str, query=final_sql, sql_result=result['sql']) @@ -274,9 +283,39 @@ class TableQuestionAnsweringPipeline(Pipeline): history_sql=history_sql, result=result, table=self.db.tables[result['table_id']]) + result['sql']['from'] = [result['table_id']] sql = self.sql_dict_to_str( result=result, table=self.db.tables[result['table_id']]) - output = {OutputKeys.OUTPUT: sql, OutputKeys.HISTORY: result['sql']} + + # add sqlite + if self.db.is_use_sqlite: + try: + cursor = self.db.connection_obj.cursor().execute(sql.query) + names = [{ + 'name': + description[0], + 'label': + self.db.tables[result['table_id']]['headerid2name'].get( + description[0], description[0]) + } for description in cursor.description] + cells = [] + for res in cursor.fetchall(): + row = {} + for name, cell in zip(names, res): + row[name['name']] = cell + cells.append(row) + tabledata = {'headers': names, 'cells': cells} + except Exception: + tabledata = {'headers': [], 'cells': []} + else: + tabledata = {'headers': [], 'cells': []} + + output = { + OutputKeys.OUTPUT: sql, + OutputKeys.HISTORY: result['sql'], + OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), + } + return output def _collate_fn(self, data): diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py index a99800cf..3d3a1f8d 100644 --- a/modelscope/preprocessors/star3/fields/database.py +++ b/modelscope/preprocessors/star3/fields/database.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
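The is_use_sqlite path above reduces to a standard in-memory SQLite round trip: build the table, run the generated SELECT, and read column names from cursor.description. A standalone sketch of that pattern, with the table name, columns and rows invented for illustration:

    # Minimal sketch of the in-memory SQLite pattern used by Database and by the
    # pipeline's query execution above; the demo table and values are invented.
    import sqlite3

    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    cur.execute('CREATE TABLE demo_table (branch TEXT, phone TEXT);')
    cur.executemany('INSERT INTO demo_table VALUES (?, ?)',
                    [('branch_a', '0632-001'), ('branch_b', '0632-002')])

    cursor = cur.execute("SELECT phone FROM demo_table WHERE branch = 'branch_a'")
    headers = [d[0] for d in cursor.description]            # column names, like the pipeline's `names`
    cells = [dict(zip(headers, row)) for row in cursor.fetchall()]
    print({'headers': headers, 'cells': cells})             # {'headers': ['phone'], 'cells': [{'phone': '0632-001'}]}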
+import sqlite3 + import json import tqdm @@ -7,18 +9,38 @@ from modelscope.preprocessors.star3.fields.struct import Trie class Database: - def __init__(self, tokenizer, table_file_path, syn_dict_file_path): + def __init__(self, + tokenizer, + table_file_path, + syn_dict_file_path, + is_use_sqlite=False): self.tokenizer = tokenizer + self.is_use_sqlite = is_use_sqlite + if self.is_use_sqlite: + self.connection_obj = sqlite3.connect(':memory:') + self.type_dict = {'text': 'TEXT', 'number': 'INT', 'date': 'TEXT'} self.tables = self.init_tables(table_file_path=table_file_path) self.syn_dict = self.init_syn_dict( syn_dict_file_path=syn_dict_file_path) + def __del__(self): + if self.is_use_sqlite: + self.connection_obj.close() + def init_tables(self, table_file_path): tables = {} lines = [] - with open(table_file_path, 'r') as fo: - for line in fo: - lines.append(line) + if type(table_file_path) == str: + with open(table_file_path, 'r') as fo: + for line in fo: + lines.append(line) + elif type(table_file_path) == list: + for path in table_file_path: + with open(path, 'r') as fo: + for line in fo: + lines.append(line) + else: + raise ValueError() for line in tqdm.tqdm(lines, desc='Load Tables'): table = json.loads(line.strip()) @@ -34,6 +56,9 @@ class Database: headers_tokens.append(empty_column) table['tablelen'] = table_header_length table['header_tok'] = headers_tokens + table['headerid2name'] = {} + for hid, hname in zip(table['header_id'], table['header_name']): + table['headerid2name'][hid] = hname table['header_types'].append('null') table['header_units'] = [ @@ -51,6 +76,26 @@ class Database: trie_set[ii].insert(word, word) table['value_trie'] = trie_set + + # create sqlite + if self.is_use_sqlite: + cursor_obj = self.connection_obj.cursor() + cursor_obj.execute('DROP TABLE IF EXISTS %s' % + (table['table_id'])) + header_string = ', '.join([ + '%s %s' % + (name, self.type_dict[htype]) for name, htype in zip( + table['header_id'], table['header_types']) + ]) + create_table_string = 'CREATE TABLE %s (%s);' % ( + table['table_id'], header_string) + cursor_obj.execute(create_table_string) + for row in table['rows']: + value_string = ', '.join(['"%s"' % (val) for val in row]) + insert_row_string = 'INSERT INTO %s VALUES(%s)' % ( + table['table_id'], value_string) + cursor_obj.execute(insert_row_string) + tables[table['table_id']] = table return tables diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py index 40613f78..7f483a1f 100644 --- a/modelscope/preprocessors/star3/fields/schema_link.py +++ b/modelscope/preprocessors/star3/fields/schema_link.py @@ -287,7 +287,13 @@ class SchemaLinker: return match_len / (len(nlu_t) + 0.1) - def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict): + def get_entity_linking(self, + tokenizer, + nlu, + nlu_t, + tables, + col_syn_dict, + history_sql=None): """ get linking between question and schema column """ @@ -305,8 +311,7 @@ class SchemaLinker: typeinfos = [] for ii, column in enumerate(table['header_name']): column = column.lower() - column_new = re.sub('(.*?)', '', column) - column_new = re.sub('(.*?)', '', column_new) + column_new = column cphrase, cscore = self.get_match_phrase( nlu.lower(), column_new) if cscore > 0.3 and cphrase.strip() != '': @@ -330,7 +335,6 @@ class SchemaLinker: for cell in ans.keys(): vphrase = cell vscore = 1.0 - # print("trie_set find:", cell, ans[cell]) phrase_tok = tokenizer.tokenize(vphrase) if len(phrase_tok) == 0 or len(vphrase) < 2: 
continue @@ -408,16 +412,25 @@ class SchemaLinker: match_score = self.get_table_match_score(nlu_t, schema_link) search_result = { - 'table_id': table['table_id'], - 'question_knowledge': final_question, - 'header_knowledge': final_header, - 'schema_link': schema_link, - 'match_score': match_score + 'table_id': + table['table_id'], + 'question_knowledge': + final_question, + 'header_knowledge': + final_header, + 'schema_link': + schema_link, + 'match_score': + match_score, + 'table_score': + int(table['table_id'] == history_sql['from'][0]) + if history_sql is not None else 0 } search_result_list.append(search_result) search_result_list = sorted( - search_result_list, key=lambda x: x['match_score'], + search_result_list, + key=lambda x: (x['match_score'], x['table_score']), reverse=True)[0:4] return search_result_list diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py index 163759a1..f98aa6d0 100644 --- a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py @@ -95,7 +95,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): # tokenize question question = data['question'] - history_sql = data['history_sql'] + history_sql = data.get('history_sql', None) nlu = question.lower() nlu_t = self.tokenizer.tokenize(nlu) @@ -105,7 +105,8 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): nlu=nlu, nlu_t=nlu_t, tables=self.db.tables, - col_syn_dict=self.db.syn_dict) + col_syn_dict=self.db.syn_dict, + history_sql=history_sql) # collect data datas = self.construct_data( diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py index eba12103..35b374f2 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/nlp_utils.py @@ -2,8 +2,7 @@ from typing import List from modelscope.outputs import OutputKeys from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline, - TableQuestionAnsweringPipeline) + DialogStateTrackingPipeline) def text2sql_tracking_and_print_results( @@ -42,17 +41,3 @@ def tracking_and_print_dialog_states( print(json.dumps(result)) history_states.extend([result[OutputKeys.OUTPUT], {}]) - - -def tableqa_tracking_and_print_results( - test_case, pipelines: List[TableQuestionAnsweringPipeline]): - for pipeline in pipelines: - historical_queries = None - for question in test_case['utterance']: - output_dict = pipeline({ - 'question': question, - 'history_sql': historical_queries - }) - print('output_dict', output_dict['output'].string, - output_dict['output'].query) - historical_queries = output_dict['history'] diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 7ea28725..68e0564f 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
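The re-ranking change in schema_link.py above is easiest to see on toy data: candidates are still sorted by match_score, but table_score now breaks ties in favour of the table referenced by the previous turn's SQL. A hedged illustration with invented scores:

    # Toy illustration of the new (match_score, table_score) ranking; the
    # candidate tables and scores below are invented.
    history_table = 't2'                       # i.e. history_sql['from'][0]
    candidates = [
        {'table_id': 't1', 'match_score': 0.42, 'table_score': 0},
        {'table_id': 't2', 'match_score': 0.42, 'table_score': 1},   # matches the history table
        {'table_id': 't3', 'match_score': 0.10, 'table_score': 0},
    ]
    ranked = sorted(candidates, key=lambda x: (x['match_score'], x['table_score']), reverse=True)[0:4]
    print([c['table_id'] for c in ranked])     # ['t2', 't1', 't3'] -- history table wins the tie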
import os import unittest +from typing import List from transformers import BertTokenizer @@ -11,10 +12,60 @@ from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor from modelscope.preprocessors.star3.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks -from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results from modelscope.utils.test_utils import test_level +def tableqa_tracking_and_print_results_with_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '那平均值是多少?', + '那水库的名称呢?', + '换成中型的呢?', + '枣庄营业厅的电话', + '那地址呢?', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + historical_queries = None + for question in test_case['utterance']: + output_dict = p({ + 'question': question, + 'history_sql': historical_queries + }) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + historical_queries = output_dict['history'] + + +def tableqa_tracking_and_print_results_without_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '枣庄营业厅的电话', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + for question in test_case['utterance']: + output_dict = p({'question': question}) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + + class TableQuestionAnswering(unittest.TestCase): def setUp(self) -> None: @@ -22,20 +73,18 @@ class TableQuestionAnswering(unittest.TestCase): self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn' model_id = 'damo/nlp_convai_text2sql_pretrain_cn' - test_case = { - 'utterance': - ['长江流域的小(2)型水库的库容总量是多少?', '那平均值是多少?', '那水库的名称呢?', '换成中型的呢?'] - } @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path) pipelines = [ - TableQuestionAnsweringPipeline( - model=cache_path, preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=cache_path, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -43,15 +92,17 @@ class TableQuestionAnswering(unittest.TestCase): preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_task(self): pipelines = [pipeline(Tasks.table_question_answering, self.model_id)] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) 
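For reference, the query_result printed by these helpers is a JSON string in the shape assembled by the pipeline's postprocess above; a hypothetical instance, with the concrete column name and cell value invented:

    # Hypothetical example of the `query_result` payload; the keys follow the
    # pipeline code, the column name and value are made up.
    import json

    tabledata = {
        'headers': [{'name': 'phone', 'label': '电话'}],   # header_id plus its display name
        'cells': [{'phone': '0632-0000000'}],              # one dict per returned row
    }
    query_result = json.dumps(tabledata, ensure_ascii=False)
    print(json.loads(query_result)['cells'][0]['phone'])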
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub_with_other_classes(self): @@ -60,15 +111,24 @@ class TableQuestionAnswering(unittest.TestCase): os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) db = Database( tokenizer=self.tokenizer, - table_file_path=os.path.join(model.model_dir, 'table.json'), - syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt')) + table_file_path=[ + os.path.join(model.model_dir, 'databases', fname) + for fname in os.listdir( + os.path.join(model.model_dir, 'databases')) + ], + syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'), + is_use_sqlite=True) preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir, db=db) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor, db=db) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor, + db=db) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_without_history(pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) if __name__ == '__main__': From 3edf30caa60af9bab70f8aea4217a79581bb473c Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:19:12 +0800 Subject: [PATCH 026/129] [to #42322933]change the default model of face detection after discussion Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371469 --- modelscope/pipelines/builder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index b18d4465..1f563915 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -118,8 +118,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), - Tasks.face_detection: (Pipelines.face_detection, - 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.face_detection: + (Pipelines.face_detection, + 'damo/cv_resnet101_face-detection_cvpr22papermogface'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), Tasks.facial_expression_recognition: From a26e6e38697a8795b99de4c7929b415baef78268 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 17:33:03 +0800 Subject: [PATCH 027/129] [to #45071449] fix setup error Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10196007 --- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 + 2 files changed, 1 insertion(+) mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/requirements/framework.txt b/requirements/framework.txt index b51faeda..aae200da 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,6 +15,7 @@ pyyaml requests scipy setuptools +setuptools_scm tensorboard tqdm>=4.64.0 yapf From 8c91a4972e2ced9f1b40613ee88f4c5197bafa6e Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 19:01:34 +0800 Subject: [PATCH 028/129] require pai-easycv 0.6.3.7 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10380097 --- requirements/cv.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cv.txt b/requirements/cv.txt index 
e6ffb5ff..eb38beb1 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -17,7 +17,7 @@ mmdet>=2.25.0 networkx>=2.5 numba onnxruntime>=1.10 -pai-easycv>=0.6.3.6 +pai-easycv>=0.6.3.7 pandas psutil regex From 295fdd1a609def5e0c8b57783f1fca656e4cbcb0 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 12 Oct 2022 19:01:57 +0800 Subject: [PATCH 029/129] [to #45443331]fix: git config email with username bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10378571 --- modelscope/hub/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 486f8df3..a149ede1 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -138,8 +138,8 @@ class GitCommandWrapper(metaclass=Singleton): repo_base_dir, repo_name, user_name) response = self._run_git_command(*config_user_name_args.split(' ')) logger.debug(response.stdout.decode('utf8')) - config_user_email_args = '-C %s/%s config user.name %s' % ( - repo_base_dir, repo_name, user_name) + config_user_email_args = '-C %s/%s config user.email %s' % ( + repo_base_dir, repo_name, user_email) response = self._run_git_command( *config_user_email_args.split(' ')) logger.debug(response.stdout.decode('utf8')) From 4cb5f8a2cd104f89b765d56527d448b2df1be151 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Wed, 12 Oct 2022 19:53:14 +0800 Subject: [PATCH 030/129] [to #42322933] add human whole body model and image object detection auto model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10319306 --- data/test/images/auto_demo.jpg | 3 + .../body_keypoints_detection.jpg | 3 - .../keypoints_detect/img_test_wholebody.jpg | 3 + modelscope/metainfo.py | 5 ++ modelscope/models/cv/__init__.py | 20 +++--- .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint.py | 17 +++++ .../models/cv/object_detection/__init__.py | 2 +- .../models/cv/object_detection/yolox_pai.py | 3 + .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint_dataset.py | 39 +++++++++++ modelscope/outputs.py | 19 +++++- modelscope/pipelines/builder.py | 8 ++- modelscope/pipelines/cv/__init__.py | 11 +++- .../cv/body_2d_keypoints_pipeline.py | 4 +- .../cv/body_3d_keypoints_pipeline.py | 2 +- .../pipelines/cv/easycv_pipelines/__init__.py | 5 +- .../cv/easycv_pipelines/detection_pipeline.py | 41 +++++++++++- .../human_wholebody_keypoint_pipeline.py | 65 +++++++++++++++++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 34 +++++++++- .../test_human_wholebody_keypoint.py | 40 ++++++++++++ tests/pipelines/test_object_detection.py | 12 ++++ 23 files changed, 353 insertions(+), 28 deletions(-) create mode 100644 data/test/images/auto_demo.jpg delete mode 100644 data/test/images/keypoints_detect/body_keypoints_detection.jpg create mode 100644 data/test/images/keypoints_detect/img_test_wholebody.jpg create mode 100644 modelscope/models/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py create mode 100644 modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py create mode 100644 tests/pipelines/test_human_wholebody_keypoint.py diff --git a/data/test/images/auto_demo.jpg b/data/test/images/auto_demo.jpg new file mode 100644 index 
00000000..30393e53 --- /dev/null +++ b/data/test/images/auto_demo.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32 +size 53219 diff --git a/data/test/images/keypoints_detect/body_keypoints_detection.jpg b/data/test/images/keypoints_detect/body_keypoints_detection.jpg deleted file mode 100644 index 71ce7d7e..00000000 --- a/data/test/images/keypoints_detect/body_keypoints_detection.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d -size 1702339 diff --git a/data/test/images/keypoints_detect/img_test_wholebody.jpg b/data/test/images/keypoints_detect/img_test_wholebody.jpg new file mode 100644 index 00000000..40a9f3f8 --- /dev/null +++ b/data/test/images/keypoints_detect/img_test_wholebody.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd +size 167407 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index cae9d188..759f1688 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -40,6 +40,7 @@ class Models(object): mtcnn = 'mtcnn' ulfd = 'ulfd' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'human-wholebody-keypoint' hand_static = 'hand-static' face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' @@ -49,6 +50,7 @@ class Models(object): # EasyCV models yolox = 'YOLOX' segformer = 'Segformer' + image_object_detection_auto = 'image-object-detection-auto' # nlp models bert = 'bert' @@ -170,6 +172,7 @@ class Pipelines(object): ocr_recognition = 'convnextTiny-ocr-recognition' image_portrait_enhancement = 'gpen-image-portrait-enhancement' image_to_image_generation = 'image-to-image-generation' + image_object_detection_auto = 'yolox_image-object-detection-auto' skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' tinynas_detection = 'tinynas-detection' @@ -185,6 +188,7 @@ class Pipelines(object): movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image' pst_action_recognition = 'patchshift-action-recognition' hand_static = 'hand-static' face_human_hand_detection = 'face-human-hand-detection' @@ -427,6 +431,7 @@ class Datasets(object): """ ClsDataset = 'ClsDataset' Face2dKeypointsDataset = 'Face2dKeypointsDataset' + HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index ba7b03c5..fd950f4c 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -4,15 +4,15 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, - face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_inpainting, - image_instance_segmentation, image_panoptic_segmentation, - image_portrait_enhancement, image_reid_person, - image_semantic_segmentation, image_to_image_generation, - image_to_image_translation, movie_scene_segmentation, - object_detection, product_retrieval_embedding, - realtime_object_detection, salient_detection, shop_segmentation, - super_resolution, video_single_object_tracking, - video_summarization, virual_tryon) + face_generation, human_wholebody_keypoint, image_classification, + image_color_enhance, image_colorization, image_denoise, + image_inpainting, image_instance_segmentation, + image_panoptic_segmentation, image_portrait_enhancement, + image_reid_person, image_semantic_segmentation, + image_to_image_generation, image_to_image_translation, + movie_scene_segmentation, object_detection, + product_retrieval_embedding, realtime_object_detection, + salient_detection, shop_segmentation, super_resolution, + video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/human_wholebody_keypoint/__init__.py b/modelscope/models/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..30e23457 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint import HumanWholeBodyKeypoint + +else: + _import_structure = { + 'human_wholebody_keypoint': ['HumanWholeBodyKeypoint'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py new file mode 100644 index 00000000..dd3c0290 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py @@ -0,0 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
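Once the model, dataset and pipeline pieces of this patch are registered, whole-body keypoint inference should reduce to the usual pipeline call. A hedged sketch, taking the task name, default model id, test image path and output keys from other hunks in this same patch:

    # Usage sketch for the new human whole-body keypoint task; not part of the
    # patch itself. Model id, task and output keys come from builder.py/outputs.py.
    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    p = pipeline(
        Tasks.human_wholebody_keypoint,
        model='damo/cv_hrnetw48_human-wholebody-keypoint_image')
    result = p('data/test/images/keypoints_detect/img_test_wholebody.jpg')
    print(result[OutputKeys.KEYPOINTS], result[OutputKeys.BOXES])   # 133 keypoints per detected person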
+from easycv.models.pose.top_down import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Models.human_wholebody_keypoint) +class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py index 974375ce..0c782d7b 100644 --- a/modelscope/models/cv/object_detection/__init__.py +++ b/modelscope/models/cv/object_detection/__init__.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: else: _import_structure = { 'mmdet_model': ['DetectionModel'], - 'yolox_pai': ['YOLOX'] + 'yolox_pai': ['YOLOX'], } import sys diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py index 985cc136..46bd4e3c 100644 --- a/modelscope/models/cv/object_detection/yolox_pai.py +++ b/modelscope/models/cv/object_detection/yolox_pai.py @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks @MODELS.register_module( group_key=Tasks.image_object_detection, module_name=Models.yolox) +@MODELS.register_module( + group_key=Tasks.image_object_detection, + module_name=Models.image_object_detection_auto) class YOLOX(EasyCVBaseModel, _YOLOX): def __init__(self, model_dir=None, *args, **kwargs): diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..472ed2d8 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint_dataset import WholeBodyCocoTopDownDataset + +else: + _import_structure = { + 'human_wholebody_keypoint_dataset': ['WholeBodyCocoTopDownDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py new file mode 100644 index 00000000..fc9469f2 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + WholeBodyCocoTopDownDataset as _WholeBodyCocoTopDownDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Datasets.HumanWholeBodyKeypointDataset) +class WholeBodyCocoTopDownDataset(EasyCVBaseDataset, + _WholeBodyCocoTopDownDataset): + """EasyCV dataset for human whole body 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. 
+ {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. + """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _WholeBodyCocoTopDownDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0f353d3d..ab3ea54a 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -203,7 +203,7 @@ TASK_OUTPUTS = { # human body keypoints detection result for single sample # { - # "poses": [ + # "keypoints": [ # [[x, y]*15], # [[x, y]*15], # [[x, y]*15] @@ -220,7 +220,7 @@ TASK_OUTPUTS = { # ] # } Tasks.body_2d_keypoints: - [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + [OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES], # 3D human body keypoints detection result for single sample # { @@ -339,6 +339,21 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # human whole body keypoints detection result for single sample + # { + # "keypoints": [ + # [[x, y]*133], + # [[x, y]*133], + # [[x, y]*133] + # ] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.human_wholebody_keypoint: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], + # video summarization result for a single video # { # "output": diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 1f563915..bc9073bc 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,8 +75,6 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), - Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, - 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), @@ -159,6 +157,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_classification: (Pipelines.daily_image_classification, 'damo/cv_vit-base_image-classification_Dailylife-labels'), + Tasks.image_object_detection: + (Pipelines.image_object_detection_auto, + 'damo/cv_yolox_image-object-detection-auto'), Tasks.ocr_recognition: (Pipelines.ocr_recognition, 'damo/cv_convnextTiny_ocr-recognition-general_damo'), @@ -186,6 +187,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), + Tasks.human_wholebody_keypoint: + (Pipelines.human_wholebody_keypoint, + 'damo/cv_hrnetw48_human-wholebody-keypoint_image'), Tasks.hand_static: (Pipelines.hand_static, 'damo/cv_mobileface_hand-static'), Tasks.face_human_hand_detection: diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 118eaf17..f84f5fe5 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -46,7 +46,10 @@ if TYPE_CHECKING: from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline from .shop_segmentation_pipleline import ShopSegmentationPipeline - from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline + from 
.easycv_pipelines import (EasyCVDetectionPipeline, + EasyCVSegmentationPipeline, + Face2DKeypointsPipeline, + HumanWholebodyKeypointsPipeline) from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline from .mog_face_detection_pipeline import MogFaceDetectionPipeline @@ -109,8 +112,10 @@ else: 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], 'easycv_pipeline': [ - 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', - 'Face2DKeypointsPipeline' + 'EasyCVDetectionPipeline', + 'EasyCVSegmentationPipeline', + 'Face2DKeypointsPipeline', + 'HumanWholebodyKeypointsPipeline', ], 'text_driven_segmentation_pipeline': ['TextDrivenSegmentationPipeline'], diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py index d6afbae4..bc2e975d 100644 --- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py @@ -73,7 +73,7 @@ class Body2DKeypointsPipeline(Pipeline): if input[0] is None or input[1] is None: return { OutputKeys.BOXES: [], - OutputKeys.POSES: [], + OutputKeys.KEYPOINTS: [], OutputKeys.SCORES: [] } @@ -83,7 +83,7 @@ class Body2DKeypointsPipeline(Pipeline): result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]]) return { OutputKeys.BOXES: result_boxes, - OutputKeys.POSES: poses, + OutputKeys.KEYPOINTS: poses, OutputKeys.SCORES: scores } diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index c3f4e8c1..3502915c 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -145,7 +145,7 @@ class Body3DKeypointsPipeline(Pipeline): kps_2d = self.human_body_2d_kps_detector(frame) box = kps_2d['boxes'][ 0] # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox - pose = kps_2d['poses'][0] # keypoints: [15, 2] + pose = kps_2d['keypoints'][0] # keypoints: [15, 2] score = kps_2d['scores'][0] # keypoints: [15, 2] all_2d_poses.append(pose) all_boxes_with_socre.append( diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py index 4f149130..e0209b85 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -7,11 +7,14 @@ if TYPE_CHECKING: from .detection_pipeline import EasyCVDetectionPipeline from .segmentation_pipeline import EasyCVSegmentationPipeline from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline + from .human_wholebody_keypoint_pipeline import HumanWholebodyKeypointsPipeline else: _import_structure = { 'detection_pipeline': ['EasyCVDetectionPipeline'], 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], - 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] + 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'], + 'human_wholebody_keypoint_pipeline': + ['HumanWholebodyKeypointsPipeline'], } import sys diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 32365102..0c2058d5 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -1,16 +1,28 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. +from typing import Any + from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.cv.image_utils import \ + show_image_object_detection_auto_result from .base import EasyCVPipeline @PIPELINES.register_module( Tasks.image_object_detection, module_name=Pipelines.easycv_detection) +@PIPELINES.register_module( + Tasks.image_object_detection, + module_name=Pipelines.image_object_detection_auto) class EasyCVDetectionPipeline(EasyCVPipeline): """Pipeline for easycv detection task.""" - def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): """ model (str): model id on modelscope hub or local model path. model_file_pattern (str): model file pattern. @@ -21,3 +33,28 @@ class EasyCVDetectionPipeline(EasyCVPipeline): model_file_pattern=model_file_pattern, *args, **kwargs) + + def show_result(self, img_path, result, save_path=None): + show_image_object_detection_auto_result(img_path, result, save_path) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + scores = [] + labels = [] + boxes = [] + for output in outputs: + for score, label, box in zip(output['detection_scores'], + output['detection_classes'], + output['detection_boxes']): + scores.append(score) + labels.append(self.cfg.CLASSES[label]) + boxes.append([b for b in box]) + + results = [{ + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: boxes + } for output in outputs] + + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py new file mode 100644 index 00000000..263f8225 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +from typing import Any + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.human_wholebody_keypoint, + module_name=Pipelines.human_wholebody_keypoint) +class HumanWholebodyKeypointsPipeline(EasyCVPipeline): + """Pipeline for human wholebody 2d keypoints detection.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
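# Usage sketch (annotation, not part of the diff): the two pipelines wired up in this
# patch can be built by task name. Model ids and image paths mirror the defaults in
# pipelines/builder.py and the tests added in this patch; adapt them to your setup.
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

detector = pipeline(
    Tasks.image_object_detection,
    model='damo/cv_yolox_image-object-detection-auto')
det_result = detector('data/test/images/auto_demo.jpg')[0]
print(det_result[OutputKeys.LABELS], det_result[OutputKeys.SCORES])
# Draw the detections back onto the image (helper added in utils/cv/image_utils.py).
detector.show_result('data/test/images/auto_demo.jpg', det_result, 'auto_demo_ret.jpg')

wholebody = pipeline(
    Tasks.human_wholebody_keypoint,
    model='damo/cv_hrnetw48_human-wholebody-keypoint_image')
kp_result = wholebody('data/test/images/keypoints_detect/img_test_wholebody.jpg')[0]
keypoints = kp_result[OutputKeys.KEYPOINTS]  # 133 whole-body keypoints per person
boxes = kp_result[OutputKeys.BOXES]          # matching person boxes [x1, y1, x2, y2]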
+ """ + self.model_dir = model + super(HumanWholebodyKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def _build_predict_op(self, **kwargs): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + detection_predictor_type = self.cfg['DETECTION']['type'] + detection_model_path = os.path.join( + self.model_dir, self.cfg['DETECTION']['model_path']) + detection_cfg_file = os.path.join(self.model_dir, + self.cfg['DETECTION']['config_file']) + detection_score_threshold = self.cfg['DETECTION']['score_threshold'] + self.cfg.pipeline.predictor_config[ + 'detection_predictor_config'] = dict( + type=detection_predictor_type, + model_path=detection_model_path, + config_file=detection_cfg_file, + score_threshold=detection_score_threshold) + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config, + **kwargs + }) + return pipeline_op + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + results = [{ + OutputKeys.KEYPOINTS: output['keypoints'], + OutputKeys.BOXES: output['boxes'] + } for output in outputs] + + return results diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2a5ac694..4fa3d766 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -29,6 +29,7 @@ class CVTasks(object): body_3d_keypoints = 'body-3d-keypoints' hand_2d_keypoints = 'hand-2d-keypoints' general_recognition = 'general-recognition' + human_wholebody_keypoint = 'human-wholebody-keypoint' image_classification = 'image-classification' image_multilabel_classification = 'image-multilabel-classification' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index eab74688..06a9bbaa 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -80,7 +80,7 @@ def realtime_object_detection_bbox_vis(image, bboxes): def draw_keypoints(output, original_image): - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) scores = np.array(output[OutputKeys.SCORES]) boxes = np.array(output[OutputKeys.BOXES]) assert len(poses) == len(scores) and len(poses) == len(boxes) @@ -234,3 +234,35 @@ def show_video_summarization_result(video_in_path, result, video_save_path): video_writer.write(frame) video_writer.release() cap.release() + + +def show_image_object_detection_auto_result(img_path, + detection_result, + save_path=None): + scores = detection_result[OutputKeys.SCORES] + labels = detection_result[OutputKeys.LABELS] + bboxes = detection_result[OutputKeys.BOXES] + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + + for (score, label, box) in zip(scores, labels, bboxes): + cv2.rectangle(img, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), (0, 0, 255), 2) + cv2.putText( + img, + f'{score:.2f}', (int(box[0]), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + cv2.putText( + img, + label, (int((box[0] + box[2]) * 0.5), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + + if save_path is not None: + cv2.imwrite(save_path, img) + return img diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py new file mode 100644 index 00000000..b214f4e1 --- /dev/null +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -0,0 +1,40 @@ +# 
Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_human_wholebody_keypoint(self): + img_path = 'data/test/images/keypoints_detect/img_test_wholebody.jpg' + model_id = 'damo/cv_hrnetw48_human-wholebody-keypoint_image' + + human_wholebody_keypoint_pipeline = pipeline( + task=Tasks.human_wholebody_keypoint, model=model_id) + output = human_wholebody_keypoint_pipeline(img_path)[0] + + output_keypoints = output[OutputKeys.KEYPOINTS] + output_pose = output[OutputKeys.BOXES] + + human_wholebody_keypoint_pipeline.predict_op.show_result( + img_path, + output_keypoints, + output_pose, + scale=1, + save_path='human_wholebody_keypoint_ret.jpg') + + for keypoint in output_keypoints: + self.assertEqual(keypoint.shape[0], 133) + for box in output_pose: + self.assertEqual(box.shape[0], 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2a74eb41..2cb217d9 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -59,6 +59,18 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_object_detection_auto_pipeline(self): + model_id = 'damo/cv_yolox_image-object-detection-auto' + test_image = 'data/test/images/auto_demo.jpg' + + image_object_detection_auto = pipeline( + Tasks.image_object_detection, model=model_id) + + result = image_object_detection_auto(test_image)[0] + image_object_detection_auto.show_result(test_image, result, + 'auto_demo_ret.jpg') + if __name__ == '__main__': unittest.main() From 2989492bc08245ff02a71ac988b175d9e038d807 Mon Sep 17 00:00:00 2001 From: "yuxiang.tyx" Date: Wed, 12 Oct 2022 19:58:50 +0800 Subject: [PATCH 031/129] =?UTF-8?q?[to=20#42322933]=E6=9B=B4=E6=96=B0face?= =?UTF-8?q?=5Fdetection=5Fscrfd=E6=A8=A1=E5=9E=8B=E5=B9=B6=E6=94=AF?= =?UTF-8?q?=E6=8C=81finetune,=20=E6=96=B0=E5=A2=9Ecard=5Fdetection?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 调整face_detection的文件层级(scrfd与其余新增face_detection方法平级); 2. 增加极大脸/旋转脸的检测方法,更新了新模型; 3. 支持读入数据集并finetune和eval; 4. 
新增card_detection模型,支持读入datasethub数据集并finetune Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244540 --- data/test/images/card_detection.jpg | 3 + data/test/images/face_detection2.jpeg | 3 + modelscope/metainfo.py | 3 + .../models/cv/face_detection/__init__.py | 4 +- .../datasets/pipelines/transforms.py | 189 ----- .../cv/face_detection/scrfd/__init__.py | 2 + .../{ => scrfd}/mmdet_patch/__init__.py | 0 .../{ => scrfd}/mmdet_patch/core/__init__.py | 0 .../mmdet_patch/core/bbox/__init__.py | 0 .../mmdet_patch/core/bbox/transforms.py | 4 +- .../core/post_processing/__init__.py | 0 .../core/post_processing/bbox_nms.py | 9 +- .../mmdet_patch/datasets/__init__.py | 0 .../datasets/pipelines/__init__.py | 8 +- .../datasets/pipelines/auto_augment.py | 271 +++++++ .../datasets/pipelines/formating.py | 113 +++ .../mmdet_patch/datasets/pipelines/loading.py | 225 ++++++ .../datasets/pipelines/transforms.py | 737 ++++++++++++++++++ .../mmdet_patch/datasets/retinaface.py | 5 +- .../mmdet_patch/models/__init__.py | 0 .../mmdet_patch/models/backbones/__init__.py | 0 .../mmdet_patch/models/backbones/resnet.py | 0 .../models/dense_heads/__init__.py | 0 .../models/dense_heads/scrfd_head.py | 11 +- .../mmdet_patch/models/detectors/__init__.py | 0 .../mmdet_patch/models/detectors/scrfd.py | 106 ++- .../cv/face_detection/scrfd/scrfd_detect.py | 71 ++ modelscope/outputs.py | 19 + modelscope/pipelines/builder.py | 4 + .../pipelines/cv/card_detection_pipeline.py | 23 + .../pipelines/cv/face_detection_pipeline.py | 39 +- .../pipelines/cv/face_recognition_pipeline.py | 2 +- .../cv/card_detection_scrfd_trainer.py | 18 + .../cv/face_detection_scrfd_trainer.py | 154 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 48 ++ tests/pipelines/test_card_detection.py | 66 ++ tests/pipelines/test_face_detection.py | 12 +- .../test_card_detection_scrfd_trainer.py | 151 ++++ .../test_face_detection_scrfd_trainer.py | 150 ++++ 40 files changed, 2173 insertions(+), 278 deletions(-) create mode 100644 data/test/images/card_detection.jpg create mode 100644 data/test/images/face_detection2.jpeg delete mode 100755 modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py create mode 100644 modelscope/models/cv/face_detection/scrfd/__init__.py rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/transforms.py (94%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/bbox_nms.py (89%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/pipelines/__init__.py (53%) create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py create mode 100755 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py rename modelscope/models/cv/face_detection/{ 
=> scrfd}/mmdet_patch/datasets/retinaface.py (97%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/resnet.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/scrfd_head.py (99%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/scrfd.py (50%) create mode 100644 modelscope/models/cv/face_detection/scrfd/scrfd_detect.py create mode 100644 modelscope/pipelines/cv/card_detection_pipeline.py create mode 100644 modelscope/trainers/cv/card_detection_scrfd_trainer.py create mode 100644 modelscope/trainers/cv/face_detection_scrfd_trainer.py create mode 100644 tests/pipelines/test_card_detection.py create mode 100644 tests/trainers/test_card_detection_scrfd_trainer.py create mode 100644 tests/trainers/test_face_detection_scrfd_trainer.py diff --git a/data/test/images/card_detection.jpg b/data/test/images/card_detection.jpg new file mode 100644 index 00000000..86728c2c --- /dev/null +++ b/data/test/images/card_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c +size 254845 diff --git a/data/test/images/face_detection2.jpeg b/data/test/images/face_detection2.jpeg new file mode 100644 index 00000000..7f6025fa --- /dev/null +++ b/data/test/images/face_detection2.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae +size 48909 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 759f1688..0917bf3e 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -148,6 +148,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + card_detection = 'resnet-card-detection-scrfd34gkps' ulfd_face_detection = 'manual-face-detection-ulfd' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' @@ -270,6 +271,8 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + face_detection_scrfd = 'face-detection-scrfd' + card_detection_scrfd = 'card-detection-scrfd' image_inpainting = 'image-inpainting' # nlp trainers diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index a2a845d2..27d1bd4c 100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -8,12 +8,14 @@ if TYPE_CHECKING: from .mtcnn import MtcnnFaceDetector from .retinaface import RetinaFaceDetection from .ulfd_slim import UlfdFaceDetector + from .scrfd import ScrfdDetect else: _import_structure = { 'ulfd_slim': ['UlfdFaceDetector'], 'retinaface': ['RetinaFaceDetection'], 'mtcnn': ['MtcnnFaceDetector'], - 'mogface': ['MogFaceDetector'] + 'mogface': ['MogFaceDetector'], + 'scrfd': ['ScrfdDetect'] } import sys 
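# Hedged sketch (annotation, not part of the diff) for the card-detection entry points
# declared above. It assumes the one-line constant.py addition exposes
# Tasks.card_detection and that pipelines/builder.py (hunk not shown here) registers a
# default model for the task; the image path is the LFS test file added above.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# ScrfdDetect is resolved lazily through the _import_structure table above.
from modelscope.models.cv.face_detection import ScrfdDetect  # noqa: F401

card_detector = pipeline(Tasks.card_detection)  # default model taken from builder.py
card_result = card_detector('data/test/images/card_detection.jpg')
print(card_result)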
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py deleted file mode 100755 index 241f2c0e..00000000 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at -https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py -""" -import numpy as np -from mmdet.datasets.builder import PIPELINES -from numpy import random - - -@PIPELINES.register_module() -class RandomSquareCrop(object): - """Random crop the image & bboxes, the cropped patches have minimum IoU - requirement with original image & bboxes, the IoU threshold is randomly - selected from min_ious. - - Args: - min_ious (tuple): minimum IoU threshold for all intersections with - bounding boxes - min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, - where a >= min_crop_size). - - Note: - The keys for bboxes, labels and masks should be paired. That is, \ - `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ - `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. - """ - - def __init__(self, - crop_ratio_range=None, - crop_choice=None, - bbox_clip_border=True): - - self.crop_ratio_range = crop_ratio_range - self.crop_choice = crop_choice - self.bbox_clip_border = bbox_clip_border - - assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) - if self.crop_ratio_range is not None: - self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range - - self.bbox2label = { - 'gt_bboxes': 'gt_labels', - 'gt_bboxes_ignore': 'gt_labels_ignore' - } - self.bbox2mask = { - 'gt_bboxes': 'gt_masks', - 'gt_bboxes_ignore': 'gt_masks_ignore' - } - - def __call__(self, results): - """Call function to crop images and bounding boxes with minimum IoU - constraint. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images and bounding boxes cropped, \ - 'img_shape' key is updated. 
- """ - - if 'img_fields' in results: - assert results['img_fields'] == ['img'], \ - 'Only single img_fields is allowed' - img = results['img'] - assert 'bbox_fields' in results - assert 'gt_bboxes' in results - boxes = results['gt_bboxes'] - h, w, c = img.shape - scale_retry = 0 - if self.crop_ratio_range is not None: - max_scale = self.crop_ratio_max - else: - max_scale = np.amax(self.crop_choice) - while True: - scale_retry += 1 - - if scale_retry == 1 or max_scale > 1.0: - if self.crop_ratio_range is not None: - scale = np.random.uniform(self.crop_ratio_min, - self.crop_ratio_max) - elif self.crop_choice is not None: - scale = np.random.choice(self.crop_choice) - else: - scale = scale * 1.2 - - for i in range(250): - short_side = min(w, h) - cw = int(scale * short_side) - ch = cw - - # TODO +1 - if w == cw: - left = 0 - elif w > cw: - left = random.randint(0, w - cw) - else: - left = random.randint(w - cw, 0) - if h == ch: - top = 0 - elif h > ch: - top = random.randint(0, h - ch) - else: - top = random.randint(h - ch, 0) - - patch = np.array( - (int(left), int(top), int(left + cw), int(top + ch)), - dtype=np.int) - - # center of boxes should inside the crop img - # only adjust boxes and instance masks when the gt is not empty - # adjust boxes - def is_center_of_bboxes_in_patch(boxes, patch): - # TODO >= - center = (boxes[:, :2] + boxes[:, 2:]) / 2 - mask = \ - ((center[:, 0] > patch[0]) - * (center[:, 1] > patch[1]) - * (center[:, 0] < patch[2]) - * (center[:, 1] < patch[3])) - return mask - - mask = is_center_of_bboxes_in_patch(boxes, patch) - if not mask.any(): - continue - for key in results.get('bbox_fields', []): - boxes = results[key].copy() - mask = is_center_of_bboxes_in_patch(boxes, patch) - boxes = boxes[mask] - if self.bbox_clip_border: - boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) - boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) - boxes -= np.tile(patch[:2], 2) - - results[key] = boxes - # labels - label_key = self.bbox2label.get(key) - if label_key in results: - results[label_key] = results[label_key][mask] - - # keypoints field - if key == 'gt_bboxes': - for kps_key in results.get('keypoints_fields', []): - keypointss = results[kps_key].copy() - keypointss = keypointss[mask, :, :] - if self.bbox_clip_border: - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - max=patch[2:]) - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - min=patch[:2]) - keypointss[:, :, 0] -= patch[0] - keypointss[:, :, 1] -= patch[1] - results[kps_key] = keypointss - - # mask fields - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][mask.nonzero() - [0]].crop(patch) - - # adjust the img no matter whether the gt is empty before crop - rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 - patch_from = patch.copy() - patch_from[0] = max(0, patch_from[0]) - patch_from[1] = max(0, patch_from[1]) - patch_from[2] = min(img.shape[1], patch_from[2]) - patch_from[3] = min(img.shape[0], patch_from[3]) - patch_to = patch.copy() - patch_to[0] = max(0, patch_to[0] * -1) - patch_to[1] = max(0, patch_to[1] * -1) - patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) - patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) - rimg[patch_to[1]:patch_to[3], - patch_to[0]:patch_to[2], :] = img[ - patch_from[1]:patch_from[3], - patch_from[0]:patch_from[2], :] - img = rimg - results['img'] = img - results['img_shape'] = img.shape - - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += 
f'(min_ious={self.min_iou}, ' - repr_str += f'crop_size={self.crop_size})' - return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/__init__.py b/modelscope/models/cv/face_detection/scrfd/__init__.py new file mode 100644 index 00000000..92f81f7a --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .scrfd_detect import ScrfdDetect diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py similarity index 94% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py index d65480eb..75e32d85 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py @@ -6,7 +6,7 @@ import numpy as np import torch -def bbox2result(bboxes, labels, num_classes, kps=None): +def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): """Convert detection results to a list of numpy arrays. 
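# Hedged illustration (annotation, not part of the diff) of the num_kps generalisation
# above: per-class result arrays now carry 5 + 2*num_kps columns instead of a
# hard-coded 15, so the same helper serves models with keypoint counts other than 5.
# The zero-detection branch shown in this hunk is enough to demonstrate the layout.
import torch

from modelscope.models.cv.face_detection.scrfd.mmdet_patch.core.bbox.transforms import \
    bbox2result

empty_boxes = torch.zeros((0, 5))
empty_labels = torch.zeros((0, ), dtype=torch.long)
empty_kps = torch.zeros((0, 8))  # num_kps=4 -> 8 coordinates per instance
per_class = bbox2result(
    empty_boxes, empty_labels, num_classes=1, kps=empty_kps, num_kps=4)
assert per_class[0].shape == (0, 5 + 4 * 2)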
Args: @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): Returns: list(ndarray): bbox results of each class """ - bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox + bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox if bboxes.shape[0] == 0: return [ np.zeros((0, bbox_len), dtype=np.float32) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py similarity index 89% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py index 7a4f5b3a..697b7338 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, Args: multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) multi_scores (Tensor): shape (n, #class), where the last column contains scores of the background class, but this will be ignored. score_thr (float): bbox threshold, bboxes with scores lower than it @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, num_classes = multi_scores.size(1) - 1 # exclude background category kps = None + if multi_kps is not None: + num_kps = int((multi_kps.shape[1] / num_classes) / 2) if multi_bboxes.shape[1] > 4: bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) if multi_kps is not None: - kps = multi_kps.view(multi_scores.size(0), -1, 10) + kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) else: bboxes = multi_bboxes[:, None].expand( multi_scores.size(0), num_classes, 4) if multi_kps is not None: kps = multi_kps[:, None].expand( - multi_scores.size(0), num_classes, 10) + multi_scores.size(0), num_classes, num_kps * 2) scores = multi_scores[:, :-1] if score_factors is not None: @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, bboxes = bboxes.reshape(-1, 4) if kps is not None: - kps = kps.reshape(-1, 10) + kps = kps.reshape(-1, num_kps * 2) scores = scores.reshape(-1) labels = labels.reshape(-1) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py similarity index 53% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py index 85288910..a2cafd1a 100755 --- 
a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py @@ -2,6 +2,12 @@ The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines """ +from .auto_augment import RotateV2 +from .formating import DefaultFormatBundleV2 +from .loading import LoadAnnotationsV2 from .transforms import RandomSquareCrop -__all__ = ['RandomSquareCrop'] +__all__ = [ + 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', + 'DefaultFormatBundleV2' +] diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py new file mode 100644 index 00000000..ee60c2e0 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py @@ -0,0 +1,271 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py +""" +import copy + +import cv2 +import mmcv +import numpy as np +from mmdet.datasets.builder import PIPELINES + +_MAX_LEVEL = 10 + + +def level_to_value(level, max_value): + """Map from level to values based on max_value.""" + return (level / _MAX_LEVEL) * max_value + + +def random_negative(value, random_negative_prob): + """Randomly negate value based on random_negative_prob.""" + return -value if np.random.rand() < random_negative_prob else value + + +def bbox2fields(): + """The key correspondence from bboxes to labels, masks and + segmentations.""" + bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + bbox2seg = { + 'gt_bboxes': 'gt_semantic_seg', + } + return bbox2label, bbox2mask, bbox2seg + + +@PIPELINES.register_module() +class RotateV2(object): + """Apply Rotate Transformation to image (and its corresponding bbox, mask, + segmentation). + + Args: + level (int | float): The level should be in range (0,_MAX_LEVEL]. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. + center (int | float | tuple[float]): Center point (w, h) of the + rotation in the source image. If None, the center of the + image will be used. Same in ``mmcv.imrotate``. + img_fill_val (int | float | tuple): The fill value for image border. + If float, the same value will be used for all the three + channels of image. If tuple, the should be 3 elements (e.g. + equals the number of channels for image). + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Default 255. + prob (float): The probability for perform transformation and + should be in range 0 to 1. + max_rotate_angle (int | float): The maximum angles for rotate + transformation. + random_negative_prob (float): The probability that turns the + offset negative. + """ + + def __init__(self, + level, + scale=1, + center=None, + img_fill_val=128, + seg_ignore_label=255, + prob=0.5, + max_rotate_angle=30, + random_negative_prob=0.5): + assert isinstance(level, (int, float)), \ + f'The level must be type int or float. got {type(level)}.' 
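# Hedged config sketch (annotation, not part of the diff) for RotateV2. The angle is
# level / _MAX_LEVEL * max_rotate_angle, and the keypoint branch below only remaps
# keypoints for exact +-90/180 degree rotations, so a keypoint-preserving setup pins
# level to the maximum value.
rotate_v2 = dict(
    type='RotateV2',
    level=10,             # level == _MAX_LEVEL, so angle == max_rotate_angle
    max_rotate_angle=90,  # rotated by exactly +90 or -90 (sign sampled per call)
    prob=0.5,
    img_fill_val=128)
# Typically placed in an mmdet-style train pipeline together with the other transforms
# registered in this patch, e.g. (values illustrative):
# train_pipeline = [..., rotate_v2, dict(type='RandomSquareCrop', crop_choice=[0.45, 0.6, 0.8, 1.0]), ...]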
+ assert 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.' + assert isinstance(scale, (int, float)), \ + f'The scale must be type int or float. got type {type(scale)}.' + if isinstance(center, (int, float)): + center = (center, center) + elif isinstance(center, tuple): + assert len(center) == 2, 'center with type tuple must have '\ + f'2 elements. got {len(center)} elements.' + else: + assert center is None, 'center must be None or type int, '\ + f'float or tuple, got type {type(center)}.' + if isinstance(img_fill_val, (float, int)): + img_fill_val = tuple([float(img_fill_val)] * 3) + elif isinstance(img_fill_val, tuple): + assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ + f'have 3 elements. got {len(img_fill_val)}.' + img_fill_val = tuple([float(val) for val in img_fill_val]) + else: + raise ValueError( + 'img_fill_val must be float or tuple with 3 elements.') + assert np.all([0 <= val <= 255 for val in img_fill_val]), \ + 'all elements of img_fill_val should between range [0,255]. '\ + f'got {img_fill_val}.' + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ + f'got {prob}.' + assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ + f'should be type int or float. got type {type(max_rotate_angle)}.' + self.level = level + self.scale = scale + # Rotation angle in degrees. Positive values mean + # clockwise rotation. + self.angle = level_to_value(level, max_rotate_angle) + self.center = center + self.img_fill_val = img_fill_val + self.seg_ignore_label = seg_ignore_label + self.prob = prob + self.max_rotate_angle = max_rotate_angle + self.random_negative_prob = random_negative_prob + + def _rotate_img(self, results, angle, center=None, scale=1.0): + """Rotate the image. + + Args: + results (dict): Result dict from loading pipeline. + angle (float): Rotation angle in degrees, positive values + mean clockwise rotation. Same in ``mmcv.imrotate``. + center (tuple[float], optional): Center point (w, h) of the + rotation. Same in ``mmcv.imrotate``. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. 
+ """ + for key in results.get('img_fields', ['img']): + img = results[key].copy() + img_rotated = mmcv.imrotate( + img, angle, center, scale, border_value=self.img_fill_val) + results[key] = img_rotated.astype(img.dtype) + results['img_shape'] = results[key].shape + + def _rotate_bboxes(self, results, rotate_matrix): + """Rotate the bboxes.""" + h, w, c = results['img_shape'] + for key in results.get('bbox_fields', []): + min_x, min_y, max_x, max_y = np.split( + results[key], results[key].shape[-1], axis=-1) + coordinates = np.stack([[min_x, min_y], [max_x, min_y], + [min_x, max_y], + [max_x, max_y]]) # [4, 2, nb_bbox, 1] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coordinates = np.concatenate( + (coordinates, + np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), + axis=1) # [4, 3, nb_bbox, 1] + coordinates = coordinates.transpose( + (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] + rotated_coords = np.matmul(rotate_matrix, + coordinates) # [nb_bbox, 4, 2, 1] + rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] + min_x, min_y = np.min( + rotated_coords[:, :, 0], axis=1), np.min( + rotated_coords[:, :, 1], axis=1) + max_x, max_y = np.max( + rotated_coords[:, :, 0], axis=1), np.max( + rotated_coords[:, :, 1], axis=1) + results[key] = np.stack([min_x, min_y, max_x, max_y], + axis=-1).astype(results[key].dtype) + + def _rotate_keypoints90(self, results, angle): + """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" + if angle not in [-90, 90, 180, -180 + ] or self.scale != 1 or self.center is not None: + return + for key in results.get('keypoints_fields', []): + k = results[key] + if angle == 90: + w, h, c = results['img'].shape + new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) + elif angle == -90: + w, h, c = results['img'].shape + new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) + else: + h, w, c = results['img'].shape + new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], + axis=-1) + # a kps is invalid if thrid value is -1 + kps_invalid = new[..., -1][:, -1] == -1 + new[kps_invalid] = np.zeros(new.shape[1:]) - 1 + results[key] = new + + def _rotate_masks(self, + results, + angle, + center=None, + scale=1.0, + fill_val=0): + """Rotate the masks.""" + h, w, c = results['img_shape'] + for key in results.get('mask_fields', []): + masks = results[key] + results[key] = masks.rotate((h, w), angle, center, scale, fill_val) + + def _rotate_seg(self, + results, + angle, + center=None, + scale=1.0, + fill_val=255): + """Rotate the segmentation map.""" + for key in results.get('seg_fields', []): + seg = results[key].copy() + results[key] = mmcv.imrotate( + seg, angle, center, scale, + border_value=fill_val).astype(seg.dtype) + + def _filter_invalid(self, results, min_bbox_size=0): + """Filter bboxes and corresponding masks too small after rotate + augmentation.""" + bbox2label, bbox2mask, _ = bbox2fields() + for key in results.get('bbox_fields', []): + bbox_w = results[key][:, 2] - results[key][:, 0] + bbox_h = results[key][:, 3] - results[key][:, 1] + valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) + valid_inds = np.nonzero(valid_inds)[0] + results[key] = results[key][valid_inds] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + # mask fields, e.g. 
gt_masks and gt_masks_ignore + mask_key = bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][valid_inds] + + def __call__(self, results): + """Call function to rotate images, bounding boxes, masks and semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated results. + """ + if np.random.rand() > self.prob: + return results + h, w = results['img'].shape[:2] + center = self.center + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + angle = random_negative(self.angle, self.random_negative_prob) + self._rotate_img(results, angle, center, self.scale) + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) + self._rotate_bboxes(results, rotate_matrix) + self._rotate_keypoints90(results, angle) + self._rotate_masks(results, angle, center, self.scale, fill_val=0) + self._rotate_seg( + results, angle, center, self.scale, fill_val=self.seg_ignore_label) + self._filter_invalid(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(level={self.level}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'center={self.center}, ' + repr_str += f'img_fill_val={self.img_fill_val}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' + repr_str += f'prob={self.prob}, ' + repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' + repr_str += f'random_negative_prob={self.random_negative_prob})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py new file mode 100644 index 00000000..bd2394a8 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py @@ -0,0 +1,113 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py +""" +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC +from mmdet.datasets.builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class DefaultFormatBundleV2(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. 
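# Minimal hedged sketch (annotation, not part of the diff) of what
# DefaultFormatBundleV2 produces; shapes are illustrative. 'img' becomes a stacked
# DataContainer and the gt_* fields, including the new 'gt_keypointss', are wrapped
# as tensor DataContainers.
import numpy as np

from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import \
    DefaultFormatBundleV2

results = dict(
    img=np.zeros((640, 640, 3), dtype=np.uint8),
    gt_bboxes=np.array([[10., 20., 50., 80.]], dtype=np.float32),
    gt_labels=np.array([0], dtype=np.int64),
    gt_keypointss=np.zeros((1, 5, 3), dtype=np.float32))
bundled = DefaultFormatBundleV2()(results)
# bundled['img'].data is a (3, 640, 640) tensor; 'pad_shape', 'scale_factor' and
# 'img_norm_cfg' are filled with defaults by _add_default_meta_keys.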
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if 'img' in results: + img = results['img'] + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', + 'gt_labels' + ]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + def __repr__(self): + return self.__class__.__name__ diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py new file mode 100644 index 00000000..b4c2a385 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py @@ -0,0 +1,225 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py +""" +import os.path as osp + +import numpy as np +import pycocotools.mask as maskUtils +from mmdet.core import BitmapMasks, PolygonMasks +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class LoadAnnotationsV2(object): + """Load mutiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_keypoints (bool): Whether to parse and load the keypoints annotation. + Default: False. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. 
Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + with_bbox=True, + with_label=True, + with_keypoints=False, + with_mask=False, + with_seg=False, + poly2mask=True, + file_client_args=dict(backend='disk')): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_keypoints = with_keypoints + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_keypoints(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_keypointss'] = ann_info['keypointss'].copy() + + results['keypoints_fields'] = ['gt_keypointss'] + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results['gt_labels'] = results['ann_info']['labels'].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. 
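# Hedged sketch (annotation, not part of the diff) of the keypoint-aware loading step
# defined above: with with_keypoints=True the annotation 'keypointss' array is copied
# into results['gt_keypointss'] and tracked via 'keypoints_fields'.
import numpy as np

from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import \
    LoadAnnotationsV2

loader = LoadAnnotationsV2(with_bbox=True, with_label=True, with_keypoints=True)
results = dict(
    ann_info=dict(
        bboxes=np.array([[10., 20., 50., 80.]], dtype=np.float32),
        labels=np.array([0], dtype=np.int64),
        keypointss=np.zeros((1, 5, 3), dtype=np.float32)),
    bbox_fields=[])
loaded = loader(results)
assert loaded['keypoints_fields'] == ['gt_keypointss']
assert 'gt_bboxes' in loaded and 'gt_labels' in loaded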
+ If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks( + [self.process_polygons(polygons) for polygons in gt_masks], h, + w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + import mmcv + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + img_bytes = self.file_client.get(filename) + results['gt_semantic_seg'] = mmcv.imfrombytes( + img_bytes, flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_keypoints: + results = self._load_keypoints(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_keypoints={self.with_keypoints}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg})' + repr_str += f'poly2mask={self.poly2mask})' + repr_str += f'poly2mask={self.file_client_args})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py new file mode 100755 index 00000000..270c34da --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py @@ -0,0 +1,737 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py +""" +import mmcv +import numpy as np +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from mmdet.datasets.builder import PIPELINES +from numpy import random + + +@PIPELINES.register_module() +class ResizeV2(object): + """Resize images & bbox & mask &kps. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. 
If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. + - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + bbox_clip_border=True, + backend='cv2', + override=False): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + # TODO: refactor the override option in Resize + self.override = override + self.bbox_clip_border = bbox_clip_border + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and uper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. 
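# Illustrative sketch, for reference only; the values below are hypothetical.
# The multiscale modes described in the ResizeV2 docstring above map onto
# configs roughly as follows:
resize_fixed = dict(type='ResizeV2', img_scale=(640, 640), keep_ratio=True)
resize_value = dict(  # pick one scale per sample from a discrete list
    type='ResizeV2',
    img_scale=[(480, 480), (640, 640), (800, 800)],
    multiscale_mode='value',
    keep_ratio=True)
resize_range = dict(  # sample long/short edges between the two endpoints
    type='ResizeV2',
    img_scale=[(480, 480), (800, 800)],
    multiscale_mode='range',
    keep_ratio=True)
resize_ratio = dict(  # scale a single base size by a random ratio
    type='ResizeV2',
    img_scale=(640, 640),
    ratio_range=(0.5, 1.5),
    keep_ratio=True)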
+ """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. 
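# Worked example, for reference only (not part of the patched files):
# `random_sample_ratio()` above draws ratio ~ U(min_ratio, max_ratio) and
# scales both edges of the single base `img_scale`, e.g.
#     img_scale=(640, 640), ratio_range=(0.5, 1.5), sampled ratio=0.75
#     -> scale = (int(640 * 0.75), int(640 * 0.75)) = (480, 480)
# so every sampled scale stays proportional to the base scale.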
+ """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_keypoints(self, results): + """Resize keypoints with ``results['scale_factor']``.""" + for key in results.get('keypoints_fields', []): + keypointss = results[key].copy() + factors = results['scale_factor'] + assert factors[0] == factors[2] + assert factors[1] == factors[3] + keypointss[:, :, 0] *= factors[0] + keypointss[:, :, 1] *= factors[1] + if self.bbox_clip_border: + img_shape = results['img_shape'] + keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0, + img_shape[1]) + keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0, + img_shape[0]) + results[key] = keypointss + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = mmcv.imresize( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + results['gt_semantic_seg'] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. 
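# Illustrative sketch, for reference only; the arrays below are made up.
# `_resize_img()` above stores a 4-element `scale_factor` (w, h, w, h);
# `_resize_bboxes()` multiplies boxes by it directly, while
# `_resize_keypoints()` uses only the x/y entries:
import numpy as np

scale_factor = np.array([0.5, 0.5, 0.5, 0.5], dtype=np.float32)
bboxes = np.array([[100., 40., 300., 200.]])                 # (N, 4)
keypointss = np.array([[[120., 60., 1.], [280., 60., 1.]]])  # (N, NK, 3)
resized_bboxes = bboxes * scale_factor          # -> [[ 50.  20. 150. 100.]]
resized_kps = keypointss.copy()
resized_kps[:, :, 0] *= scale_factor[0]         # x scaled by w_scale
resized_kps[:, :, 1] *= scale_factor[1]         # y scaled by h_scale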
+ """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio})' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlipV2(object): + """Flip the image & bbox & mask & kps. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image wil + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. + E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image wil + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5 + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. 
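# Illustrative sketch, for reference only; the values below are hypothetical.
# The flip_ratio/direction combinations described above can be configured as:
flip_half = dict(type='RandomFlipV2', flip_ratio=0.5)  # horizontal, p=0.5
flip_mixed = dict(  # 0.3 horizontal, 0.2 vertical, 0.5 no flip
    type='RandomFlipV2',
    flip_ratio=[0.3, 0.2],
    direction=['horizontal', 'vertical'])
# Note: `keypoints_flip()` below only supports horizontal flips, so
# non-horizontal directions are only safe when no keypoints fields are set.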
+ """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + self.count = 0 + + def bbox_flip(self, bboxes, img_shape, direction): + """Flip bboxes horizontally. + + Args: + bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) + img_shape (tuple[int]): Image shape (height, width) + direction (str): Flip direction. Options are 'horizontal', + 'vertical'. + + Returns: + numpy.ndarray: Flipped bounding boxes. + """ + + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + def keypoints_flip(self, keypointss, img_shape, direction): + """Flip keypoints horizontally.""" + + assert direction == 'horizontal' + assert keypointss.shape[-1] == 3 + num_kps = keypointss.shape[1] + assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}' + assert keypointss.ndim == 3 + flipped = keypointss.copy() + if num_kps == 5: + flip_order = [1, 0, 2, 4, 3] + elif num_kps == 4: + flip_order = [3, 2, 1, 0] + for idx, a in enumerate(flip_order): + flipped[:, idx, :] = keypointss[:, a, :] + w = img_shape[1] + flipped[..., 0] = w - flipped[..., 0] + return flipped + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
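# Worked example, for reference only; the coordinates are made up and the
# landmark naming assumes the usual 5-point face layout (left eye, right eye,
# nose, left mouth corner, right mouth corner). `keypoints_flip()` above
# reorders left/right landmarks with flip_order = [1, 0, 2, 4, 3] and then
# mirrors the x coordinates:
import numpy as np

w = 40  # image width
kps = np.array([[[10., 10., 1.], [30., 10., 1.], [20., 20., 1.],
                 [12., 30., 1.], [28., 30., 1.]]])   # (1, 5, 3)
flipped = kps[:, [1, 0, 2, 4, 3], :].copy()          # swap eyes / mouth corners
flipped[..., 0] = w - flipped[..., 0]                # mirror x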
+ """ + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) + - 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip kps + for key in results.get('keypoints_fields', []): + results[key] = self.keypoints_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + +@PIPELINES.register_module() +class RandomSquareCrop(object): + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. + + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, + crop_ratio_range=None, + crop_choice=None, + bbox_clip_border=True, + big_face_ratio=0, + big_face_crop_choice=None): + + self.crop_ratio_range = crop_ratio_range + self.crop_choice = crop_choice + self.big_face_crop_choice = big_face_crop_choice + self.bbox_clip_border = bbox_clip_border + + assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) + if self.crop_ratio_range is not None: + self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range + + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + assert big_face_ratio >= 0 and big_face_ratio <= 1.0 + self.big_face_ratio = big_face_ratio + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. 
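# Illustrative sketch, for reference only; the values below are hypothetical.
# `RandomSquareCrop` above takes exactly one of `crop_ratio_range` (continuous)
# or `crop_choice` (discrete); the square crop side is ratio * long image edge.
# `big_face_ratio` optionally crops around a single large, non-overlapping face
# and then samples the scale from `big_face_crop_choice`:
square_crop = dict(
    type='RandomSquareCrop',
    crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0],
    big_face_ratio=0.1,
    big_face_crop_choice=[0.6, 0.8, 1.0])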
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + assert 'gt_bboxes' in results + # try augment big face images + find_bigface = False + if np.random.random() < self.big_face_ratio: + min_size = 100 # h and w + expand_ratio = 0.3 # expand ratio of croped face alongwith both w and h + bbox = results['gt_bboxes'].copy() + lmks = results['gt_keypointss'].copy() + label = results['gt_labels'].copy() + # filter small faces + size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( + (bbox[:, 3] - bbox[:, 1]) > min_size) + bbox = bbox[size_mask] + lmks = lmks[size_mask] + label = label[size_mask] + # randomly choose a face that has no overlap with others + if len(bbox) > 0: + overlaps = bbox_overlaps(bbox, bbox) + overlaps -= np.eye(overlaps.shape[0]) + iou_mask = np.sum(overlaps, axis=1) == 0 + bbox = bbox[iou_mask] + lmks = lmks[iou_mask] + label = label[iou_mask] + if len(bbox) > 0: + choice = np.random.randint(len(bbox)) + bbox = bbox[choice] + lmks = lmks[choice] + label = [label[choice]] + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x1 = bbox[0] - w * expand_ratio + x2 = bbox[2] + w * expand_ratio + y1 = bbox[1] - h * expand_ratio + y2 = bbox[3] + h * expand_ratio + x1, x2 = np.clip([x1, x2], 0, img.shape[1]) + y1, y2 = np.clip([y1, y2], 0, img.shape[0]) + bbox -= np.tile([x1, y1], 2) + lmks -= (x1, y1, 0) + + find_bigface = True + img = img[int(y1):int(y2), int(x1):int(x2), :] + results['gt_bboxes'] = np.expand_dims(bbox, axis=0) + results['gt_keypointss'] = np.expand_dims(lmks, axis=0) + results['gt_labels'] = np.array(label) + results['img'] = img + + boxes = results['gt_bboxes'] + h, w, c = img.shape + + if self.crop_ratio_range is not None: + max_scale = self.crop_ratio_max + else: + max_scale = np.amax(self.crop_choice) + scale_retry = 0 + while True: + scale_retry += 1 + if scale_retry == 1 or max_scale > 1.0: + if self.crop_ratio_range is not None: + scale = np.random.uniform(self.crop_ratio_min, + self.crop_ratio_max) + elif self.crop_choice is not None: + scale = np.random.choice(self.crop_choice) + else: + scale = scale * 1.2 + + if find_bigface: + # select a scale from big_face_crop_choice if in big_face mode + scale = np.random.choice(self.big_face_crop_choice) + + for i in range(250): + long_side = max(w, h) + cw = int(scale * long_side) + ch = cw + + # TODO +1 + if w == cw: + left = 0 + elif w > cw: + left = random.randint(0, w - cw) + else: + left = random.randint(w - cw, 0) + if h == ch: + top = 0 + elif h > ch: + top = random.randint(0, h - ch) + else: + top = random.randint(h - ch, 0) + + patch = np.array( + (int(left), int(top), int(left + cw), int(top + ch)), + dtype=np.int32) + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + # TODO >= + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = \ + ((center[:, 0] > patch[0]) + * (center[:, 1] > patch[1]) + * (center[:, 0] < patch[2]) + * (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 
2)
+
+                    results[key] = boxes
+                    # labels
+                    label_key = self.bbox2label.get(key)
+                    if label_key in results:
+                        results[label_key] = results[label_key][mask]
+
+                    # keypoints field
+                    if key == 'gt_bboxes':
+                        for kps_key in results.get('keypoints_fields', []):
+                            keypointss = results[kps_key].copy()
+                            keypointss = keypointss[mask, :, :]
+                            if self.bbox_clip_border:
+                                keypointss[:, :, :
+                                           2] = keypointss[:, :, :2].clip(
+                                               max=patch[2:])
+                                keypointss[:, :, :
+                                           2] = keypointss[:, :, :2].clip(
+                                               min=patch[:2])
+                            keypointss[:, :, 0] -= patch[0]
+                            keypointss[:, :, 1] -= patch[1]
+                            results[kps_key] = keypointss
+
+                    # mask fields
+                    mask_key = self.bbox2mask.get(key)
+                    if mask_key in results:
+                        results[mask_key] = results[mask_key][mask.nonzero()
+                                                              [0]].crop(patch)
+
+                # adjust the img no matter whether the gt is empty before crop
+                rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128
+                patch_from = patch.copy()
+                patch_from[0] = max(0, patch_from[0])
+                patch_from[1] = max(0, patch_from[1])
+                patch_from[2] = min(img.shape[1], patch_from[2])
+                patch_from[3] = min(img.shape[0], patch_from[3])
+                patch_to = patch.copy()
+                patch_to[0] = max(0, patch_to[0] * -1)
+                patch_to[1] = max(0, patch_to[1] * -1)
+                patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0])
+                patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1])
+                rimg[patch_to[1]:patch_to[3],
+                     patch_to[0]:patch_to[2], :] = img[
+                         patch_from[1]:patch_from[3],
+                         patch_from[0]:patch_from[2], :]
+                img = rimg
+                results['img'] = img
+                results['img_shape'] = img.shape
+
+                return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(crop_ratio_range={self.crop_ratio_range}, '
+        repr_str += f'crop_choice={self.crop_choice}, '
+        repr_str += f'big_face_ratio={self.big_face_ratio}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py
similarity index 97%
rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py
index bbacd9be..40c440b9 100755
--- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py
+++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py
@@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset):
     CLASSES = ('FG', )
 
     def __init__(self, min_size=None, **kwargs):
-        self.NK = 5
+        self.NK = kwargs.pop('num_kps', 5)
         self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)}
         self.min_size = min_size
        self.gt_path = kwargs.get('gt_path')
@@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset):
                 if len(values) > 4:
                     if len(values) > 5:
                         kps = np.array(
-                            values[4:19], dtype=np.float32).reshape((self.NK, 3))
+                            values[4:4 + self.NK * 3], dtype=np.float32).reshape(
+                                (self.NK, 3))
                         for li in range(kps.shape[0]):
                             if (kps[li, :] == -1).all():
                                 kps[li][2] = 0.0  # weight = 0, ignore
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py
similarity index 100%
rename from modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py
rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py
similarity index 100%
rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py
rename to
modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py similarity index 99% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py index acc45670..77ec99cf 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): scale_mode=1, dw_conv=False, use_kps=False, + num_kps=5, loss_kps=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), **kwargs): @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): self.scale_mode = scale_mode self.use_dfl = True self.dw_conv = dw_conv - self.NK = 5 + self.NK = num_kps self.extra_flops = 0.0 if loss_dfl is None or not loss_dfl: self.use_dfl = False @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): batch_size, -1, self.cls_out_channels).sigmoid() bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) - kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) - + kps_pred = kps_pred.permute(0, 2, 3, + 1).reshape(batch_size, -1, self.NK * 2) return cls_score, bbox_pred, kps_pred def forward_train(self, @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): if self.use_dfl: kps_pred = self.integral(kps_pred) * stride[0] else: - kps_pred = kps_pred.reshape((-1, 10)) * stride[0] + kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) if mlvl_kps is not None: scale_factor2 = torch.tensor( - [scale_factor[0], scale_factor[1]] * 5) + [scale_factor[0], scale_factor[1]] * self.NK) mlvl_kps /= scale_factor2.to(mlvl_kps.device) mlvl_scores = torch.cat(mlvl_scores) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py similarity index 50% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py rename 
to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py index a5f5cac2..18b46be1 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): gt_bboxes_ignore) return losses - def simple_test(self, img, img_metas, rescale=False): + def simple_test(self, + img, + img_metas, + rescale=False, + repeat_head=1, + output_kps_var=0, + output_results=1): """Test function without test time augmentation. Args: @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. + repeat_head (int): repeat inference times in head + output_kps_var (int): whether output kps var to calculate quality + output_results (int): 0: nothing 1: bbox 2: both bbox and kps Returns: list[list[np.ndarray]]: BBox results of each image and classes. @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): corresponds to each class. """ x = self.extract_feat(img) - outs = self.bbox_head(x) - if torch.onnx.is_in_onnx_export(): - print('single_stage.py in-onnx-export') - print(outs.__class__) - cls_score, bbox_pred, kps_pred = outs - for c in cls_score: - print(c.shape) - for c in bbox_pred: - print(c.shape) - if self.bbox_head.use_kps: - for c in kps_pred: + assert repeat_head >= 1 + kps_out0 = [] + kps_out1 = [] + kps_out2 = [] + for i in range(repeat_head): + outs = self.bbox_head(x) + kps_out0 += [outs[2][0].detach().cpu().numpy()] + kps_out1 += [outs[2][1].detach().cpu().numpy()] + kps_out2 += [outs[2][2].detach().cpu().numpy()] + if output_kps_var: + var0 = np.var(np.vstack(kps_out0), axis=0).mean() + var1 = np.var(np.vstack(kps_out1), axis=0).mean() + var2 = np.var(np.vstack(kps_out2), axis=0).mean() + var = np.mean([var0, var1, var2]) + else: + var = None + + if output_results > 0: + if torch.onnx.is_in_onnx_export(): + print('single_stage.py in-onnx-export') + print(outs.__class__) + cls_score, bbox_pred, kps_pred = outs + for c in cls_score: + print(c.shape) + for c in bbox_pred: print(c.shape) - return (cls_score, bbox_pred, kps_pred) - else: - return (cls_score, bbox_pred) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) + if self.bbox_head.use_kps: + for c in kps_pred: + print(c.shape) + return (cls_score, bbox_pred, kps_pred) + else: + return (cls_score, bbox_pred) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) - # return kps if use_kps - if len(bbox_list[0]) == 2: - bbox_results = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in bbox_list - ] - elif len(bbox_list[0]) == 3: - bbox_results = [ - bbox2result( - det_bboxes, - det_labels, - self.bbox_head.num_classes, - kps=det_kps) - for det_bboxes, det_labels, det_kps in bbox_list - ] - return bbox_results + # return kps if use_kps + if len(bbox_list[0]) == 2: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + elif len(bbox_list[0]) == 3: + if output_results == 2: + bbox_results = [ + bbox2result( + det_bboxes, + det_labels, + self.bbox_head.num_classes, + kps=det_kps, + num_kps=self.bbox_head.NK) + for det_bboxes, det_labels, det_kps in bbox_list + ] + elif output_results == 1: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + for 
det_bboxes, det_labels, _ in bbox_list + ] + else: + bbox_results = None + if var is not None: + return bbox_results, var + else: + return bbox_results def feature_test(self, img): x = self.extract_feat(img) diff --git a/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py new file mode 100644 index 00000000..59611604 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from copy import deepcopy +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ScrfdDetect'] + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd) +class ScrfdDetect(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the face detection model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + from mmcv import Config + from mmcv.parallel import MMDataParallel + from mmcv.runner import load_checkpoint + from mmdet.models import build_detector + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py')) + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) + detector = build_detector(cfg.model) + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + load_checkpoint(detector, ckpt_path, map_location=device) + detector = MMDataParallel(detector, device_ids=[0]) + detector.eval() + self.detector = detector + logger.info('load model done') + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector( + return_loss=False, + rescale=True, + img=[input['img'][0].unsqueeze(0)], + img_metas=[[dict(input['img_metas'][0].data)]], + output_results=2) + assert result is not None + result = result[0][0] + bboxes = result[:, :4].tolist() + kpss = result[:, 5:].tolist() + scores = result[:, 4].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: kpss + } + + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/outputs.py b/modelscope/outputs.py index ab3ea54a..3001c03c 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -90,6 +90,25 @@ TASK_OUTPUTS = { Tasks.face_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # card detection result for single sample + # { + # "scores": [0.9, 0.1, 0.05, 0.05] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ], + # "keypoints": [ + # [x1, y1, 
x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # ], + # } + Tasks.card_detection: + [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample # { # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index bc9073bc..174d10b1 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -116,6 +116,10 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), + Tasks.face_detection: (Pipelines.face_detection, + 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.card_detection: (Pipelines.card_detection, + 'damo/cv_resnet_carddetection_scrfd34gkps'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet101_face-detection_cvpr22papermogface'), diff --git a/modelscope/pipelines/cv/card_detection_pipeline.py b/modelscope/pipelines/cv/card_detection_pipeline.py new file mode 100644 index 00000000..00b18024 --- /dev/null +++ b/modelscope/pipelines/cv/card_detection_pipeline.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.cv.face_detection_pipeline import \ + FaceDetectionPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.card_detection, module_name=Pipelines.card_detection) +class CardDetectionPipeline(FaceDetectionPipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a card detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + thr = 0.45 # card/face detect use different threshold + super().__init__(model=model, score_thr=thr, **kwargs) diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py index eff5b70f..608567a4 100644 --- a/modelscope/pipelines/cv/face_detection_pipeline.py +++ b/modelscope/pipelines/cv/face_detection_pipeline.py @@ -8,6 +8,7 @@ import PIL import torch from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import ScrfdDetect from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -29,27 +30,8 @@ class FaceDetectionPipeline(Pipeline): model: model id on modelscope hub. 
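# Illustrative usage sketch, for reference only (not part of the patched
# files). With the registrations added in this patch, the card detector is
# created like any other pipeline and returns the face_detection-style output
# (scores, boxes, and 4 corner keypoints per card):
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

card_detector = pipeline(
    Tasks.card_detection, model='damo/cv_resnet_carddetection_scrfd34gkps')
result = card_detector('data/test/images/card_detection.jpg')
print(result[OutputKeys.SCORES], result[OutputKeys.BOXES],
      result[OutputKeys.KEYPOINTS])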
""" super().__init__(model=model, **kwargs) - from mmcv import Config - from mmcv.parallel import MMDataParallel - from mmcv.runner import load_checkpoint - from mmdet.models import build_detector - from modelscope.models.cv.face_detection.mmdet_patch.datasets import RetinaFaceDataset - from modelscope.models.cv.face_detection.mmdet_patch.datasets.pipelines import RandomSquareCrop - from modelscope.models.cv.face_detection.mmdet_patch.models.backbones import ResNetV1e - from modelscope.models.cv.face_detection.mmdet_patch.models.dense_heads import SCRFDHead - from modelscope.models.cv.face_detection.mmdet_patch.models.detectors import SCRFD - cfg = Config.fromfile(osp.join(model, 'mmcv_scrfd_10g_bnkps.py')) - detector = build_detector( - cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) - ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) - logger.info(f'loading model from {ckpt_path}') - device = torch.device( - f'cuda:{0}' if torch.cuda.is_available() else 'cpu') - load_checkpoint(detector, ckpt_path, map_location=device) - detector = MMDataParallel(detector, device_ids=[0]) - detector.eval() + detector = ScrfdDetect(model_dir=model, **kwargs) self.detector = detector - logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) @@ -85,22 +67,7 @@ class FaceDetectionPipeline(Pipeline): return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - - result = self.detector( - return_loss=False, - rescale=True, - img=[input['img'][0].unsqueeze(0)], - img_metas=[[dict(input['img_metas'][0].data)]]) - assert result is not None - result = result[0][0] - bboxes = result[:, :4].tolist() - kpss = result[:, 5:].tolist() - scores = result[:, 4].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.BOXES: bboxes, - OutputKeys.KEYPOINTS: kpss - } + return self.detector(input) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index 873e4a1f..abae69d4 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -49,7 +49,7 @@ class FaceRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) + Tasks.face_detection, model=det_model_id, model_revision='v2') def _choose_face(self, det_result, diff --git a/modelscope/trainers/cv/card_detection_scrfd_trainer.py b/modelscope/trainers/cv/card_detection_scrfd_trainer.py new file mode 100644 index 00000000..e1f81bcf --- /dev/null +++ b/modelscope/trainers/cv/card_detection_scrfd_trainer.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.cv.face_detection_scrfd_trainer import \ + FaceDetectionScrfdTrainer + + +@TRAINERS.register_module(module_name=Trainers.card_detection_scrfd) +class CardDetectionScrfdTrainer(FaceDetectionScrfdTrainer): + + def __init__(self, cfg_file: str, *args, **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. 
+ """ + # card/face dataset use different img folder names + super().__init__(cfg_file, imgdir_name='', **kwargs) diff --git a/modelscope/trainers/cv/face_detection_scrfd_trainer.py b/modelscope/trainers/cv/face_detection_scrfd_trainer.py new file mode 100644 index 00000000..9cfae7dd --- /dev/null +++ b/modelscope/trainers/cv/face_detection_scrfd_trainer.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import os +import os.path as osp +import time +from typing import Callable, Dict, Optional + +from modelscope.metainfo import Trainers +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS + + +@TRAINERS.register_module(module_name=Trainers.face_detection_scrfd) +class FaceDetectionScrfdTrainer(BaseTrainer): + + def __init__(self, + cfg_file: str, + cfg_modify_fn: Optional[Callable] = None, + *args, + **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. + cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. + """ + import mmcv + from mmcv.runner import get_dist_info, init_dist + from mmcv.utils import get_git_hash + from mmdet.utils import collect_env, get_root_logger + from mmdet.apis import set_random_seed + from mmdet.models import build_detector + from mmdet.datasets import build_dataset + from mmdet import __version__ + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import DefaultFormatBundleV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import LoadAnnotationsV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RotateV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + super().__init__(cfg_file) + cfg = self.cfg + if 'work_dir' in kwargs: + cfg.work_dir = kwargs['work_dir'] + else: + # use config filename as default work_dir if work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(cfg_file))[0]) + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + + if 'resume_from' in kwargs: # pretrain model for finetune + cfg.resume_from = kwargs['resume_from'] + cfg.device = 'cuda' + if 'gpu_ids' in kwargs: + cfg.gpu_ids = kwargs['gpu_ids'] + else: + cfg.gpu_ids = range(1) + labelfile_name = kwargs.pop('labelfile_name', 'labelv2.txt') + imgdir_name = kwargs.pop('imgdir_name', 'images/') + if 'train_root' in kwargs: + cfg.data.train.ann_file = kwargs['train_root'] + labelfile_name + cfg.data.train.img_prefix = kwargs['train_root'] + imgdir_name + if 'val_root' in kwargs: + cfg.data.val.ann_file = kwargs['val_root'] + labelfile_name + cfg.data.val.img_prefix = kwargs['val_root'] + imgdir_name + if 'total_epochs' in kwargs: + cfg.total_epochs = kwargs['total_epochs'] + if cfg_modify_fn is not None: + cfg = cfg_modify_fn(cfg) + if 'launcher' in kwargs: + distributed = True + init_dist(kwargs['launcher'], **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + else: + distributed = False + # no_validate=True 
will not evaluate checkpoint during training + cfg.no_validate = kwargs.get('no_validate', False) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if 'seed' in kwargs: + cfg.seed = kwargs['seed'] + _deterministic = kwargs.get('deterministic', False) + logger.info(f'Set random seed to {kwargs["seed"]}, ' + f'deterministic: {_deterministic}') + set_random_seed(kwargs['seed'], deterministic=_deterministic) + else: + cfg.seed = None + meta['seed'] = cfg.seed + meta['exp_name'] = osp.basename(cfg_file) + + model = build_detector(cfg.model) + model.init_weights() + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=__version__ + get_git_hash()[:7], + CLASSES=datasets[0].CLASSES) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + + self.cfg = cfg + self.datasets = datasets + self.model = model + self.distributed = distributed + self.timestamp = timestamp + self.meta = meta + self.logger = logger + + def train(self, *args, **kwargs): + from mmdet.apis import train_detector + train_detector( + self.model, + self.datasets, + self.cfg, + distributed=self.distributed, + validate=(not self.cfg.no_validate), + timestamp=self.timestamp, + meta=self.meta) + + def evaluate(self, + checkpoint_path: str = None, + *args, + **kwargs) -> Dict[str, float]: + cfg = self.cfg.evaluation + logger.info(f'eval cfg {cfg}') + logger.info(f'checkpoint_path {checkpoint_path}') diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 4fa3d766..5f0532ce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -19,6 +19,7 @@ class CVTasks(object): # human face body related animal_recognition = 'animal-recognition' face_detection = 'face-detection' + card_detection = 'card-detection' face_recognition = 'face-recognition' facial_expression_recognition = 'facial-expression-recognition' face_2d_keypoints = 'face-2d-keypoints' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 06a9bbaa..2d420892 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -154,6 +154,54 @@ def draw_face_detection_result(img_path, detection_result): return img +def draw_card_detection_result(img_path, detection_result): + + def warp_img(src_img, kps, ratio): + short_size = 500 + if ratio > 1: + obj_h = short_size + obj_w = int(obj_h * ratio) + else: + obj_w = short_size + obj_h = int(obj_w / ratio) + 
input_pts = np.float32([kps[0], kps[1], kps[2], kps[3]]) + output_pts = np.float32([[0, obj_h - 1], [0, 0], [obj_w - 1, 0], + [obj_w - 1, obj_h - 1]]) + M = cv2.getPerspectiveTransform(input_pts, output_pts) + obj_img = cv2.warpPerspective(src_img, M, (obj_w, obj_h)) + return obj_img + + bboxes = np.array(detection_result[OutputKeys.BOXES]) + kpss = np.array(detection_result[OutputKeys.KEYPOINTS]) + scores = np.array(detection_result[OutputKeys.SCORES]) + img_list = [] + ver_col = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)] + img = cv2.imread(img_path) + img_list += [img] + assert img is not None, f"Can't read img: {img_path}" + for i in range(len(scores)): + bbox = bboxes[i].astype(np.int32) + kps = kpss[i].reshape(-1, 2).astype(np.int32) + _w = (kps[0][0] - kps[3][0])**2 + (kps[0][1] - kps[3][1])**2 + _h = (kps[0][0] - kps[1][0])**2 + (kps[0][1] - kps[1][1])**2 + ratio = 1.59 if _w >= _h else 1 / 1.59 + card_img = warp_img(img, kps, ratio) + img_list += [card_img] + score = scores[i] + x1, y1, x2, y2 = bbox + cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 4) + for k, kp in enumerate(kps): + cv2.circle(img, tuple(kp), 1, color=ver_col[k], thickness=10) + cv2.putText( + img, + f'{score:.2f}', (x1, y2), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + return img_list + + def created_boxed_image(image_in, box): image = load_image(image_in) img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) diff --git a/tests/pipelines/test_card_detection.py b/tests/pipelines/test_card_detection.py new file mode 100644 index 00000000..d913f494 --- /dev/null +++ b/tests/pipelines/test_card_detection.py @@ -0,0 +1,66 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_card_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class CardDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.card_detection + self.model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + + def show_result(self, img_path, detection_result): + img_list = draw_card_detection_result(img_path, detection_result) + for i, img in enumerate(img_list): + if i == 0: + cv2.imwrite('result.jpg', img_list[0]) + print( + f'Found {len(img_list)-1} cards, output written to {osp.abspath("result.jpg")}' + ) + else: + cv2.imwrite(f'card_{i}.jpg', img_list[i]) + save_path = osp.abspath(f'card_{i}.jpg') + print(f'detect card_{i}: {save_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_dataset(self): + input_location = ['data/test/images/card_detection.jpg'] + + dataset = MsDataset.load(input_location, target='image') + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + # note that for dataset output, the inference-output is a Generator that can be iterated. 
+ result = card_detection(dataset) + result = next(result) + self.show_result(input_location[0], result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + img_path = 'data/test/images/card_detection.jpg' + + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + card_detection = pipeline(Tasks.card_detection) + img_path = 'data/test/images/card_detection.jpg' + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index f89e9a94..31ae403e 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -25,10 +25,11 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_dataset(self): - input_location = ['data/test/images/face_detection.png'] + input_location = ['data/test/images/face_detection2.jpeg'] dataset = MsDataset.load(input_location, target='image') - face_detection = pipeline(Tasks.face_detection, model=self.model_id) + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') # note that for dataset output, the inference-output is a Generator that can be iterated. result = face_detection(dataset) result = next(result) @@ -36,8 +37,9 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - face_detection = pipeline(Tasks.face_detection, model=self.model_id) - img_path = 'data/test/images/face_detection.png' + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) @@ -45,7 +47,7 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): face_detection = pipeline(Tasks.face_detection) - img_path = 'data/test/images/face_detection.png' + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) diff --git a/tests/trainers/test_card_detection_scrfd_trainer.py b/tests/trainers/test_card_detection_scrfd_trainer.py new file mode 100644 index 00000000..af87000b --- /dev/null +++ b/tests/trainers/test_card_detection_scrfd_trainer.py @@ -0,0 +1,151 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. + ms_ds_syncards = MsDataset.load( + 'SyntheticCards_mini', namespace='shaoxuan') + + data_path = ms_ds_syncards.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id) + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestCardDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestCardDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs 
Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_face_detection_scrfd_trainer.py b/tests/trainers/test_face_detection_scrfd_trainer.py new file mode 100644 index 00000000..eb9440ef --- /dev/null +++ b/tests/trainers/test_face_detection_scrfd_trainer.py @@ -0,0 +1,150 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. 
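+    # e.g. MsDataset.load('WIDER_FACE', namespace='shaoxuan'), assuming the
+    # full dataset is published under the same namespace.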
+ ms_ds_widerface = MsDataset.load('WIDER_FACE_mini', namespace='shaoxuan') + + data_path = ms_ds_widerface.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id, revision='v2') + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestFaceDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestFaceDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current 
test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 3863efc14d6da1786a93e7652d949d8d55ae8624 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 10:15:33 +0800 Subject: [PATCH 032/129] [to #42322933] add far field KWS trainer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10275823 --- data/test/audios/noise_2ch.wav | 3 + .../test/audios/wake_word_with_label_xyxy.wav | 3 + modelscope/metainfo.py | 1 + modelscope/models/audio/kws/farfield/model.py | 63 ++-- .../task_datasets/audio/__init__.py | 21 ++ .../audio/kws_farfield_dataset.py | 280 ++++++++++++++++++ .../trainers/audio/kws_farfield_trainer.py | 279 +++++++++++++++++ modelscope/utils/audio/audio_utils.py | 18 ++ requirements/audio.txt | 6 +- .../audio/test_kws_farfield_trainer.py | 85 ++++++ 10 files changed, 721 insertions(+), 38 deletions(-) create mode 100644 data/test/audios/noise_2ch.wav create mode 100644 data/test/audios/wake_word_with_label_xyxy.wav create mode 100644 modelscope/msdatasets/task_datasets/audio/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py create mode 100644 modelscope/trainers/audio/kws_farfield_trainer.py create mode 100644 tests/trainers/audio/test_kws_farfield_trainer.py diff --git a/data/test/audios/noise_2ch.wav b/data/test/audios/noise_2ch.wav new file mode 100644 index 00000000..c754e39a --- /dev/null +++ b/data/test/audios/noise_2ch.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d +size 2327764 diff --git a/data/test/audios/wake_word_with_label_xyxy.wav b/data/test/audios/wake_word_with_label_xyxy.wav new file mode 100644 index 00000000..b7999777 --- /dev/null +++ b/data/test/audios/wake_word_with_label_xyxy.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150 +size 68524 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 0917bf3e..46c3b138 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -285,6 +285,7 @@ class Trainers(object): # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' class Preprocessors(object): diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index fea82194..d63d1e2a 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,15 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os -from typing import Dict - -import torch +from typing import Dict, Optional from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import Tasks from .fsmn_sele_v2 import FSMNSeleNetV2 @@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel): MODEL_TXT = 'model.txt' SC_CONFIG = 'sound_connect.conf' - SC_CONF_ITEM_KWS_MODEL = '${kws_model}' - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + training: Optional[bool] = False, + *args, + **kwargs): """initialize the dfsmn model from the `model_dir` path. Args: model_dir (str): the model path. """ super().__init__(model_dir, *args, **kwargs) - sc_config_file = os.path.join(model_dir, self.SC_CONFIG) - model_txt_file = os.path.join(model_dir, self.MODEL_TXT) - model_bin_file = os.path.join(model_dir, - ModelFile.TORCH_MODEL_BIN_FILE) - self._model = None - if os.path.exists(model_bin_file): - kwargs.pop('device') - self._model = FSMNSeleNetV2(*args, **kwargs) - checkpoint = torch.load(model_bin_file) - self._model.load_state_dict(checkpoint, strict=False) - - self._sc = None - if os.path.exists(model_txt_file): - with open(sc_config_file) as f: - lines = f.readlines() - with open(sc_config_file, 'w') as f: - for line in lines: - if self.SC_CONF_ITEM_KWS_MODEL in line: - line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, - model_txt_file) - f.write(line) - import py_sound_connect - self._sc = py_sound_connect.SoundConnect(sc_config_file) - self.size_in = self._sc.bytesPerBlockIn() - self.size_out = self._sc.bytesPerBlockOut() - - if self._model is None and self._sc is None: - raise Exception( - f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' - ) + if training: + self.model = FSMNSeleNetV2(*args, **kwargs) + else: + sc_config_file = os.path.join(model_dir, self.SC_CONFIG) + model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + self._sc = None + if os.path.exists(model_txt_file): + conf_dict = dict(mode=56542, kws_model=model_txt_file) + update_conf(sc_config_file, sc_config_file, conf_dict) + import py_sound_connect + self._sc = py_sound_connect.SoundConnect(sc_config_file) + self.size_in = self._sc.bytesPerBlockIn() + self.size_out = self._sc.bytesPerBlockOut() + else: + raise Exception( + f'Invalid model directory! Failed to load model file: {model_txt_file}.' + ) def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - ... + return self.model.forward(input) def forward_decode(self, data: bytes): result = {'pcm': self._sc.process(data, self.size_out)} diff --git a/modelscope/msdatasets/task_datasets/audio/__init__.py b/modelscope/msdatasets/task_datasets/audio/__init__.py new file mode 100644 index 00000000..c62a8d9c --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
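+# KWSDataset and KWSDataLoader are exposed via LazyImportModule, so the
+# kws_farfield_dataset module (and its runtime dependency on py_sound_connect)
+# is only imported when these classes are first accessed.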
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .kws_farfield_dataset import KWSDataset, KWSDataLoader + +else: + _import_structure = { + 'kws_farfield_dataset': ['KWSDataset', 'KWSDataLoader'], + } + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py new file mode 100644 index 00000000..8c518ec9 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py @@ -0,0 +1,280 @@ +""" +Used to prepare simulated data. +""" +import math +import os.path +import queue +import threading +import time + +import numpy as np +import torch + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +BLOCK_DEC = 2 +BLOCK_CAT = 3 +FBANK_SIZE = 40 +LABEL_SIZE = 1 +LABEL_GAIN = 100.0 + + +class KWSDataset: + """ + dataset for keyword spotting and vad + conf_basetrain: basetrain configure file path + conf_finetune: finetune configure file path, null allowed + numworkers: no. of workers + basetrainratio: basetrain workers ratio + numclasses: no. of nn output classes, 2 classes to generate vad label + blockdec: block decimation + blockcat: block concatenation + """ + + def __init__(self, + conf_basetrain, + conf_finetune, + numworkers, + basetrainratio, + numclasses, + blockdec=BLOCK_CAT, + blockcat=BLOCK_CAT): + super().__init__() + self.numclasses = numclasses + self.blockdec = blockdec + self.blockcat = blockcat + self.sims_base = [] + self.sims_senior = [] + self.setup_sims(conf_basetrain, conf_finetune, numworkers, + basetrainratio) + + def release(self): + for sim in self.sims_base: + del sim + for sim in self.sims_senior: + del sim + del self.base_conf + del self.senior_conf + logger.info('KWSDataset: Released.') + + def setup_sims(self, conf_basetrain, conf_finetune, numworkers, + basetrainratio): + if not os.path.exists(conf_basetrain): + raise ValueError(f'{conf_basetrain} does not exist!') + if not os.path.exists(conf_finetune): + raise ValueError(f'{conf_finetune} does not exist!') + import py_sound_connect + logger.info('KWSDataset init SoundConnect...') + num_base = math.ceil(numworkers * basetrainratio) + num_senior = numworkers - num_base + # hold by fields to avoid python releasing conf object + self.base_conf = py_sound_connect.ConfigFile(conf_basetrain) + self.senior_conf = py_sound_connect.ConfigFile(conf_finetune) + for i in range(num_base): + fs = py_sound_connect.FeatSimuKWS(self.base_conf.params) + self.sims_base.append(fs) + for i in range(num_senior): + self.sims_senior.append( + py_sound_connect.FeatSimuKWS(self.senior_conf.params)) + logger.info('KWSDataset init SoundConnect finished.') + + def getBatch(self, id): + """ + Generate a data batch + + Args: + id: worker id + + Return: time x channel x feature, label + """ + fs = self.get_sim(id) + fs.processBatch() + # get multi-channel feature vector size + featsize = fs.featSize() + # get label vector size + labelsize = fs.labelSize() + # get minibatch size (time dimension) + # batchsize = fs.featBatchSize() + # no. 
of fe output channels + numchs = featsize // FBANK_SIZE + # get raw data + fs_feat = fs.feat() + data = np.frombuffer(fs_feat, dtype='float32') + data = data.reshape((-1, featsize + labelsize)) + + # convert float label to int + label = data[:, FBANK_SIZE * numchs:] + + if self.numclasses == 2: + # generate vad label + label[label > 0.0] = 1.0 + else: + # generate kws label + label = np.round(label * LABEL_GAIN) + label[label > self.numclasses - 1] = 0.0 + + # decimated size + size1 = int(np.ceil( + label.shape[0] / self.blockdec)) - self.blockcat + 1 + + # label decimation + label1 = np.zeros((size1, LABEL_SIZE), dtype='float32') + for tau in range(size1): + label1[tau, :] = label[(tau + self.blockcat // 2) + * self.blockdec, :] + + # feature decimation and concatenation + # time x channel x feature + featall = np.zeros((size1, numchs, FBANK_SIZE * self.blockcat), + dtype='float32') + for n in range(numchs): + feat = data[:, FBANK_SIZE * n:FBANK_SIZE * (n + 1)] + + for tau in range(size1): + for i in range(self.blockcat): + featall[tau, n, FBANK_SIZE * i:FBANK_SIZE * (i + 1)] = \ + feat[(tau + i) * self.blockdec, :] + + return torch.from_numpy(featall), torch.from_numpy(label1).long() + + def get_sim(self, id): + num_base = len(self.sims_base) + if id < num_base: + fs = self.sims_base[id] + else: + fs = self.sims_senior[id - num_base] + return fs + + +class Worker(threading.Thread): + """ + id: worker id + dataset: the dataset + pool: queue as the global data buffer + """ + + def __init__(self, id, dataset, pool): + threading.Thread.__init__(self) + + self.id = id + self.dataset = dataset + self.pool = pool + self.isrun = True + self.nn = 0 + + def run(self): + while self.isrun: + self.nn += 1 + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:1') + # get simulated minibatch + if self.isrun: + data = self.dataset.getBatch(self.id) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:2') + + # put data into buffer + if self.isrun: + self.pool.put(data) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:3') + + logger.info('KWSDataLoader: Worker {:02d} stopped.'.format(self.id)) + + def stopWorker(self): + """ + stop the worker thread + """ + self.isrun = False + + +class KWSDataLoader: + """ + dataset: the dataset reference + batchsize: data batch size + numworkers: no. 
of workers + prefetch: prefetch factor + """ + + def __init__(self, dataset, batchsize, numworkers, prefetch=2): + self.dataset = dataset + self.batchsize = batchsize + self.datamap = {} + self.isrun = True + + # data queue + self.pool = queue.Queue(batchsize * prefetch) + + # initialize workers + self.workerlist = [] + for id in range(numworkers): + w = Worker(id, dataset, self.pool) + self.workerlist.append(w) + + def __iter__(self): + return self + + def __next__(self): + while self.isrun: + # get data from common data pool + data = self.pool.get() + self.pool.task_done() + + # group minibatches with the same shape + key = str(data[0].shape) + + batchl = self.datamap.get(key) + if batchl is None: + batchl = [] + self.datamap.update({key: batchl}) + + batchl.append(data) + + # a full data batch collected + if len(batchl) >= self.batchsize: + featbatch = [] + labelbatch = [] + + for feat, label in batchl: + featbatch.append(feat) + labelbatch.append(label) + + batchl.clear() + + feattensor = torch.stack(featbatch, dim=0) + labeltensor = torch.stack(labelbatch, dim=0) + + if feattensor.shape[-2] == 1: + logger.debug('KWSDataLoader: Basetrain batch.') + else: + logger.debug('KWSDataLoader: Finetune batch.') + + return feattensor, labeltensor + + return None, None + + def start(self): + """ + start multi-thread data loader + """ + for w in self.workerlist: + w.start() + + def stop(self): + """ + stop data loader + """ + logger.info('KWSDataLoader: Stopping...') + self.isrun = False + + for w in self.workerlist: + w.stopWorker() + + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + + # wait workers terminated + for w in self.workerlist: + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + w.join() + logger.info('KWSDataLoader: All worker stopped.') diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py new file mode 100644 index 00000000..a720ced5 --- /dev/null +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -0,0 +1,279 @@ +import datetime +import math +import os +from typing import Callable, Dict, Optional + +import numpy as np +import torch +from torch import nn as nn +from torch import optim as optim + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models import Model, TorchModel +from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.data_utils import to_device +from modelscope.utils.device import create_device +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist, is_master) + +logger = get_logger() + +BASETRAIN_CONF_EASY = 'basetrain_easy' +BASETRAIN_CONF_NORMAL = 'basetrain_normal' +BASETRAIN_CONF_HARD = 'basetrain_hard' +FINETUNE_CONF_EASY = 'finetune_easy' +FINETUNE_CONF_NORMAL = 'finetune_normal' +FINETUNE_CONF_HARD = 'finetune_hard' + +EASY_RATIO = 0.1 +NORMAL_RATIO = 0.6 +HARD_RATIO = 0.3 +BASETRAIN_RATIO = 0.5 + + +@TRAINERS.register_module(module_name=Trainers.speech_dfsmn_kws_char_farfield) +class KWSFarfieldTrainer(BaseTrainer): + DEFAULT_WORK_DIR = './work_dir' + conf_keys = (BASETRAIN_CONF_EASY, FINETUNE_CONF_EASY, + 
BASETRAIN_CONF_NORMAL, FINETUNE_CONF_NORMAL, + BASETRAIN_CONF_HARD, FINETUNE_CONF_HARD) + + def __init__(self, + model: str, + work_dir: str, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + custom_conf: Optional[dict] = None, + **kwargs): + + if isinstance(model, str): + if os.path.exists(model): + self.model_dir = model if os.path.isdir( + model) else os.path.dirname(model) + else: + self.model_dir = snapshot_download( + model, revision=model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' + self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + + self.model = self.build_model() + self.work_dir = work_dir + # the number of model output dimension + # should update config outside the trainer, if user need more wake word + self._num_classes = self.cfg.model.num_syn + + if kwargs.get('launcher', None) is not None: + init_dist(kwargs['launcher']) + + _, world_size = get_dist_info() + self._dist = world_size > 1 + + device_name = kwargs.get('device', 'gpu') + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + + self.device = create_device(device_name) + # model placement + if self.device.type == 'cuda': + self.model.to(self.device) + + if 'max_epochs' not in kwargs: + assert hasattr( + self.cfg.train, 'max_epochs' + ), 'max_epochs is missing from the configuration file' + self._max_epochs = self.cfg.train.max_epochs + else: + self._max_epochs = kwargs['max_epochs'] + self._train_iters = kwargs.get('train_iters_per_epoch', None) + self._val_iters = kwargs.get('val_iters_per_epoch', None) + if self._train_iters is None: + self._train_iters = self.cfg.train.train_iters_per_epoch + if self._val_iters is None: + self._val_iters = self.cfg.evaluation.val_iters_per_epoch + dataloader_config = self.cfg.train.dataloader + self._threads = kwargs.get('workers', None) + if self._threads is None: + self._threads = dataloader_config.workers_per_gpu + self._single_rate = BASETRAIN_RATIO + if 'single_rate' in kwargs: + self._single_rate = kwargs['single_rate'] + self._batch_size = dataloader_config.batch_size_per_gpu + if 'model_bin' in kwargs: + model_bin_file = os.path.join(self.model_dir, kwargs['model_bin']) + checkpoint = torch.load(model_bin_file) + self.model.load_state_dict(checkpoint) + # build corresponding optimizer and loss function + lr = self.cfg.train.optimizer.lr + self.optimizer = optim.Adam(self.model.parameters(), lr) + self.loss_fn = nn.CrossEntropyLoss() + self.data_val = None + self.json_log_path = os.path.join(self.work_dir, + '{}.log.json'.format(self.timestamp)) + self.conf_files = [] + for conf_key in self.conf_keys: + template_file = os.path.join(self.model_dir, conf_key) + conf_file = os.path.join(self.model_dir, f'{conf_key}.conf') + update_conf(template_file, conf_file, custom_conf[conf_key]) + self.conf_files.append(conf_file) + self._current_epoch = 0 + self.stages = (math.floor(self._max_epochs * EASY_RATIO), + math.floor(self._max_epochs * NORMAL_RATIO), + math.floor(self._max_epochs * HARD_RATIO)) + + def build_model(self) -> nn.Module: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. 
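+
+        A minimal override sketch (CustomKwsNet is a hypothetical module):
+
+            def build_model(self) -> nn.Module:
+                return CustomKwsNet(num_classes=self.cfg.model.num_syn)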
+ + """ + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, training=True) + if isinstance(model, TorchModel) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def train(self, *args, **kwargs): + if not self.data_val: + self.gen_val() + logger.info('Start training...') + totaltime = datetime.datetime.now() + + for stage, num_epoch in enumerate(self.stages): + self.run_stage(stage, num_epoch) + + # total time spent + totaltime = datetime.datetime.now() - totaltime + logger.info('Total time spent: {:.2f} hours\n'.format( + totaltime.total_seconds() / 3600.0)) + + def run_stage(self, stage, num_epoch): + """ + Run training stages with correspond data + + Args: + stage: id of stage + num_epoch: the number of epoch to run in this stage + """ + if num_epoch <= 0: + logger.warning(f'Invalid epoch number, stage {stage} exit!') + return + logger.info(f'Starting stage {stage}...') + dataset, dataloader = self.create_dataloader( + self.conf_files[stage * 2], self.conf_files[stage * 2 + 1]) + it = iter(dataloader) + for _ in range(num_epoch): + self._current_epoch += 1 + epochtime = datetime.datetime.now() + logger.info('Start epoch %d...', self._current_epoch) + loss_train_epoch = 0.0 + validbatchs = 0 + for bi in range(self._train_iters): + # prepare data + feat, label = next(it) + label = torch.reshape(label, (-1, )) + feat = to_device(feat, self.device) + label = to_device(label, self.device) + # apply model + self.optimizer.zero_grad() + predict = self.model(feat) + # calculate loss + loss = self.loss_fn( + torch.reshape(predict, (-1, self._num_classes)), label) + if not np.isnan(loss.item()): + loss.backward() + self.optimizer.step() + loss_train_epoch += loss.item() + validbatchs += 1 + train_result = 'Epoch: {:04d}/{:04d}, batch: {:04d}/{:04d}, loss: {:.4f}'.format( + self._current_epoch, self._max_epochs, bi + 1, + self._train_iters, loss.item()) + logger.info(train_result) + self._dump_log(train_result) + + # average training loss in one epoch + loss_train_epoch /= validbatchs + loss_val_epoch = self.evaluate('') + val_result = 'Evaluate epoch: {:04d}, loss_train: {:.4f}, loss_val: {:.4f}'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + logger.info(val_result) + self._dump_log(val_result) + # check point + ckpt_name = 'checkpoint_{:04d}_loss_train_{:.4f}_loss_val_{:.4f}.pth'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + torch.save(self.model, os.path.join(self.work_dir, ckpt_name)) + # time spent per epoch + epochtime = datetime.datetime.now() - epochtime + logger.info('Epoch {:04d} time spent: {:.2f} hours'.format( + self._current_epoch, + epochtime.total_seconds() / 3600.0)) + dataloader.stop() + dataset.release() + logger.info(f'Stage {stage} is finished.') + + def gen_val(self): + """ + generate validation set + """ + logger.info('Start generating validation set...') + dataset, dataloader = self.create_dataloader(self.conf_files[2], + self.conf_files[3]) + it = iter(dataloader) + + self.data_val = [] + for bi in range(self._val_iters): + logger.info('Iterating validation data %d', bi) + feat, label = next(it) + label = torch.reshape(label, (-1, )) + self.data_val.append([feat, label]) + + dataloader.stop() + dataset.release() + logger.info('Finish generating validation set!') + + def create_dataloader(self, base_path, finetune_path): + dataset = KWSDataset(base_path, finetune_path, self._threads, + self._single_rate, self._num_classes) + dataloader = KWSDataLoader( + dataset, 
batchsize=self._batch_size, numworkers=self._threads) + dataloader.start() + return dataset, dataloader + + def evaluate(self, checkpoint_path: str, *args, + **kwargs) -> Dict[str, float]: + logger.info('Start validation...') + loss_val_epoch = 0.0 + + with torch.no_grad(): + for feat, label in self.data_val: + feat = to_device(feat, self.device) + label = to_device(label, self.device) + # apply model + predict = self.model(feat) + # calculate loss + loss = self.loss_fn( + torch.reshape(predict, (-1, self._num_classes)), label) + loss_val_epoch += loss.item() + logger.info('Finish validation.') + return loss_val_epoch / self._val_iters + + def _dump_log(self, msg): + if is_master(): + with open(self.json_log_path, 'a+') as f: + f.write(msg) + f.write('\n') diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 4c2c45cc..647d9521 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import re import struct from typing import Union from urllib.parse import urlparse @@ -37,6 +38,23 @@ def audio_norm(x): return x +def update_conf(origin_config_file, new_config_file, conf_item: [str, str]): + + def repl(matched): + key = matched.group(1) + if key in conf_item: + return conf_item[key] + else: + return None + + with open(origin_config_file) as f: + lines = f.readlines() + with open(new_config_file, 'w') as f: + for line in lines: + line = re.sub(r'\$\{(.*)\}', repl, line) + f.write(line) + + def extract_pcm_from_wav(wav: bytes) -> bytes: data = wav if len(data) > 44: diff --git a/requirements/audio.txt b/requirements/audio.txt index d22ad8f1..742cf166 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -14,7 +14,11 @@ nltk numpy<=1.18 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
protobuf>3,<3.21.0 -py_sound_connect +ptflops +py_sound_connect>=0.1 +pytorch_wavelets +PyWavelets>=1.0.0 +scikit-learn SoundFile>0.10 sox torchaudio diff --git a/tests/trainers/audio/test_kws_farfield_trainer.py b/tests/trainers/audio/test_kws_farfield_trainer.py new file mode 100644 index 00000000..2631a542 --- /dev/null +++ b/tests/trainers/audio/test_kws_farfield_trainer.py @@ -0,0 +1,85 @@ +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from modelscope.utils.test_utils import test_level + +POS_FILE = 'data/test/audios/wake_word_with_label_xyxy.wav' +NEG_FILE = 'data/test/audios/speech_with_noise.wav' +NOISE_FILE = 'data/test/audios/speech_with_noise.wav' +INTERF_FILE = 'data/test/audios/speech_with_noise.wav' +REF_FILE = 'data/test/audios/farend_speech.wav' +NOISE_2CH_FILE = 'data/test/audios/noise_2ch.wav' + + +class TestKwsFarfieldTrainer(unittest.TestCase): + REVISION = 'beta' + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + print(f'tmp dir: {self.tmp_dir}') + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' + + train_pos_list = self.create_list('pos.list', POS_FILE) + train_neg_list = self.create_list('neg.list', NEG_FILE) + train_noise1_list = self.create_list('noise.list', NOISE_FILE) + train_noise2_list = self.create_list('noise_2ch.list', NOISE_2CH_FILE) + train_interf_list = self.create_list('interf.list', INTERF_FILE) + train_ref_list = self.create_list('ref.list', REF_FILE) + + base_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list) + fintune_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list, + train_noise2_type='1', + train_noise1_ratio='0.2', + train_noise2_list=train_noise2_list, + train_interf_list=train_interf_list, + train_ref_list=train_ref_list) + self.custom_conf = dict( + basetrain_easy=base_dict, + basetrain_normal=base_dict, + basetrain_hard=base_dict, + finetune_easy=fintune_dict, + finetune_normal=fintune_dict, + finetune_hard=fintune_dict) + + def create_list(self, list_name, audio_file): + pos_list_file = os.path.join(self.tmp_dir, list_name) + with open(pos_list_file, 'w') as f: + for i in range(10): + f.write(f'{os.path.join(os.getcwd(), audio_file)}\n') + train_pos_list = f'{pos_list_file}, 1.0' + return train_pos_list + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + kwargs = dict( + model=self.model_id, + work_dir=self.tmp_dir, + model_revision=self.REVISION, + workers=2, + max_epochs=2, + train_iters_per_epoch=2, + val_iters_per_epoch=1, + custom_conf=self.custom_conf) + + trainer = build_trainer( + Trainers.speech_dfsmn_kws_char_farfield, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files, + f'work_dir:{self.tmp_dir}') From 144ffee2cfaa89389930cba4c991ce03493502d2 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Thu, 13 Oct 2022 10:16:07 +0800 Subject: [PATCH 033/129] [to #42322933] Add explict model id in tts UT Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 --- 
tests/pipelines/test_text_to_speech.py | 89 +++++++++++--------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 0caf1c84..9a1cd7b1 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,67 +27,50 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - zhcn_text = '今天北京天气怎么样' - en_text = 'How is the weather in Beijing?' - zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] - enus_voice = ['andy', 'annie'] - engb_voice = ['luca', 'luna'] - self.tts_test_cases = [] - for voice in zhcn_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'zh-cn') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': zhcn_text - }) - for voice in enus_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-us') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - for voice in engb_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-gb') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' - enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' - engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' - self.tts_test_cases.append({ - 'voice': 'zhcn', - 'model_id': zhcn_model_id, - 'text': zhcn_text - }) - self.tts_test_cases.append({ - 'voice': 'enus', - 'model_id': enus_model_id, - 'text': en_text - }) - self.tts_test_cases.append({ - 'voice': 'engb', - 'model_id': engb_model_id, - 'text': en_text - }) + self.zhcn_text = '今天北京天气怎么样' + self.en_text = 'How is the weather in Beijing?' 
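+        # Explicit per-voice model ids below replace the old string-template
+        # construction, so every test case points at a fixed, known hub model.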
+ self.zhcn_voices = [ + 'zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo', 'zhcn' + ] + self.zhcn_models = [ + 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhizhe_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhiyan_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhibei_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + ] + self.en_voices = ['luca', 'luna', 'andy', 'annie', 'engb', 'enus'] + self.en_models = [ + 'damo/speech_sambert-hifigan_tts_luca_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_luna_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_andy_en-us_16k', + 'damo/speech_sambert-hifigan_tts_annie_en-us_16k', + 'damo/speech_sambert-hifigan_tts_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_en-us_16k' + ] @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - for case in self.tts_test_cases: - logger.info('test %s' % case['voice']) + for i in range(len(self.zhcn_voices)): + logger.info('test %s' % self.zhcn_voices[i]) model = Model.from_pretrained( - model_name_or_path=case['model_id'], revision='pytorch_am') + model_name_or_path=self.zhcn_models[i], revision='pytorch_am') sambert_hifigan_tts = pipeline(task=self.task, model=model) self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=case['text']) + output = sambert_hifigan_tts(input=self.zhcn_text) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) pcm = output[OutputKeys.OUTPUT_PCM] - write('output_%s.wav' % case['voice'], 16000, pcm) + write('output_%s.wav' % self.zhcn_voices[i], 16000, pcm) + for i in range(len(self.en_voices)): + logger.info('test %s' % self.en_voices[i]) + model = Model.from_pretrained( + model_name_or_path=self.en_models[i], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=self.en_text) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % self.en_voices[i], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From f63d7f18f14dc919297ec104bef56cf2e1990bfc Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Thu, 13 Oct 2022 10:39:56 +0800 Subject: [PATCH 034/129] [to #42322933]remove sleep in train_loop Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9910419 --- modelscope/trainers/trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 4c21d63f..9eaff762 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -828,7 +828,6 @@ class EpochBasedTrainer(BaseTrainer): self.model.train() for _ in range(self._epoch, self._max_epochs): self.invoke_hook(TrainerStages.before_train_epoch) - time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): if i < self.inner_iter: # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch. 
@@ -852,7 +851,6 @@ class EpochBasedTrainer(BaseTrainer): self._inner_iter = 0 self._epoch += 1 - time.sleep(1) # wait for some hooks like loggers to finish self.invoke_hook(TrainerStages.after_run) def evaluation_loop(self, data_loader, metric_classes): From 0eb823b76490bb3249bf1420143873293a132fb7 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 10:52:40 +0800 Subject: [PATCH 035/129] [to #42322933] support t5_with_translation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10383770 * T5 support translate --- modelscope/metainfo.py | 4 ++ .../nlp/text2text_generation_pipeline.py | 39 ++++++++++++++++--- modelscope/preprocessors/nlp/nlp_base.py | 3 +- modelscope/utils/config.py | 10 +++++ tests/pipelines/test_text2text_generation.py | 26 +++++++------ 5 files changed, 63 insertions(+), 19 deletions(-) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 46c3b138..59c779e9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -228,6 +228,9 @@ class Pipelines(object): relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + translation_en_to_de = 'translation_en_to_de' # keep it underscore + translation_en_to_ro = 'translation_en_to_ro' # keep it underscore + translation_en_to_fr = 'translation_en_to_fr' # keep it underscore # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -314,6 +317,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' nli_tokenizer = 'nli-tokenizer' diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py index 21aacf54..a739df69 100644 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -1,21 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import torch +from numpy import isin from modelscope.metainfo import Pipelines from modelscope.models.base import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Input, Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import Text2TextGenerationPreprocessor +from modelscope.utils.config import use_task_specific_params from modelscope.utils.constant import Tasks __all__ = ['Text2TextGenerationPipeline'] +TRANSLATE_PIPELINES = [ + Pipelines.translation_en_to_de, + Pipelines.translation_en_to_ro, + Pipelines.translation_en_to_fr, +] + @PIPELINES.register_module( Tasks.text2text_generation, module_name=Pipelines.text2text_generation) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro) +@PIPELINES.register_module( + Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr) class Text2TextGenerationPipeline(Pipeline): def __init__( @@ -39,13 +53,13 @@ class Text2TextGenerationPipeline(Pipeline): Example: >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline(task='text-generation', - >>> model='damo/nlp_palm2.0_text-generation_chinese-base') - >>> sentence1 = '本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:' - >>> '1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代' + >>> pipeline_ins = pipeline(task='text2text-generation', + >>> model='damo/nlp_t5_text2text-generation_chinese-base') + >>> sentence1 = '中国的首都位于。' >>> print(pipeline_ins(sentence1)) >>> # Or use the dict input: >>> print(pipeline_ins({'sentence': sentence1})) + >>> # 北京 To view other examples plese check the tests/pipelines/test_text_generation.py. 
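+
+        A translation sketch (the model id comes from
+        tests/pipelines/test_text2text_generation.py; for translation pipelines
+        the task-specific prefix, e.g. 'translate English to German: ', is
+        prepended to the input automatically):
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>    model='damo/t5-translate-base-test')
+            >>> print(pipeline_ins('My name is Wolfgang and I live in Berlin'))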
""" @@ -56,9 +70,22 @@ class Text2TextGenerationPipeline(Pipeline): model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) self.tokenizer = preprocessor.tokenizer + self.pipeline = model.pipeline.type model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks + """ + if not isinstance(inputs, str): + raise ValueError(f'Not supported input type: {type(inputs)}') + + if self.pipeline in TRANSLATE_PIPELINES: + use_task_specific_params(self.model, self.pipeline) + inputs = self.model.config.prefix + inputs + + return super().preprocess(inputs, **preprocess_params) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index a9be0cb0..bec7e4e1 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -12,7 +12,8 @@ from modelscope.metainfo import Models, Preprocessors from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.config import (Config, ConfigFields, + use_task_specific_params) from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 0b966bef..c4fa3c1b 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -633,6 +633,16 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.evaluation) +def use_task_specific_params(model, task): + """Update config with summarization specific params.""" + task_specific_params = model.config.task_specific_params + + if task_specific_params is not None: + pars = task_specific_params.get(task, {}) + logger.info(f'using task specific params for {task}: {pars}') + model.config.update(pars) + + class JSONIteratorEncoder(json.JSONEncoder): """Implement this method in order that supporting arbitrary iterators, it returns a serializable object for ``obj``, or calls the base implementation diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 2506547e..d90263c4 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -15,42 +15,44 @@ from modelscope.utils.test_utils import test_level class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.model_id = 'damo/t5-cn-base-test' - self.input = '中国的首都位于。' + self.model_id_generate = 'damo/t5-cn-base-test' + self.input_generate = '中国的首都位于。' + self.model_id_translate = 'damo/t5-translate-base-test' + self.input_translate = 'My name is Wolfgang and I live in Berlin' - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_T5(self): - cache_path = snapshot_download(self.model_id) - model = T5ForConditionalGeneration(cache_path) + cache_path = snapshot_download(self.model_id_generate) + model = T5ForConditionalGeneration.from_pretrained(cache_path) preprocessor 
= Text2TextGenerationPreprocessor(cache_path)
         pipeline1 = Text2TextGenerationPipeline(model, preprocessor)
         pipeline2 = pipeline(
             Tasks.text2text_generation, model=model, preprocessor=preprocessor)
         print(
-            f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}'
+            f'pipeline1: {pipeline1(self.input_generate)}\npipeline2: {pipeline2(self.input_generate)}'
         )

-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_pipeline_with_model_instance(self):
-        model = Model.from_pretrained(self.model_id)
+        model = Model.from_pretrained(self.model_id_translate)
         preprocessor = Text2TextGenerationPreprocessor(model.model_dir)
         pipeline_ins = pipeline(
             task=Tasks.text2text_generation,
             model=model,
             preprocessor=preprocessor)
-        print(pipeline_ins(self.input))
+        print(pipeline_ins(self.input_translate))

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_run_pipeline_with_model_id(self):
         pipeline_ins = pipeline(
-            task=Tasks.text2text_generation, model=self.model_id)
-        print(pipeline_ins(self.input))
+            task=Tasks.text2text_generation, model=self.model_id_translate)
+        print(pipeline_ins(self.input_translate))

     @unittest.skip(
         'only for test cases, there is no default official model yet')
     def test_run_pipeline_without_model_id(self):
         pipeline_ins = pipeline(task=Tasks.text2text_generation)
-        print(pipeline_ins(self.input))
+        print(pipeline_ins(self.input_generate))

     @unittest.skip('demo compatibility test is only enabled on a needed-basis')
     def test_demo_compatibility(self):

From 2d50c812df09c3423fe8ccbc12b15eaae5706c79 Mon Sep 17 00:00:00 2001
From: "hanyuan.chy" 
Date: Thu, 13 Oct 2022 13:48:11 +0800
Subject: [PATCH 036/129] [to #42322933] support finetune on cv/hand_2d_keypoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add finetune functionality for 2D hand keypoint detection.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371710
---
 modelscope/metainfo.py                        |  2 +
 .../models/cv/hand_2d_keypoints/__init__.py   | 20 ++++++
 .../cv/hand_2d_keypoints/hand_2d_keypoints.py | 16 +++++
 .../cv/hand_2d_keypoints/__init__.py          | 22 ++++++
 .../hand_2d_keypoints_dataset.py              | 38 ++++++++++
 .../test_easycv_trainer_hand_2d_keypoints.py  | 72 +++++++++++++++++++
 6 files changed, 170 insertions(+)
 create mode 100644 modelscope/models/cv/hand_2d_keypoints/__init__.py
 create mode 100644 modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py
 create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py
 create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py
 create mode 100644 tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 59c779e9..2e3fed98 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -50,6 +50,7 @@ class Models(object):
     # EasyCV models
     yolox = 'YOLOX'
     segformer = 'Segformer'
+    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
     image_object_detection_auto = 'image-object-detection-auto'

     # nlp models
@@ -439,6 +440,7 @@ class Datasets(object):
     """
     ClsDataset = 'ClsDataset'
     Face2dKeypointsDataset = 'Face2dKeypointsDataset'
+    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
     HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset'
     SegDataset = 'SegDataset'
     DetDataset = 'DetDataset'
diff --git a/modelscope/models/cv/hand_2d_keypoints/__init__.py 
b/modelscope/models/cv/hand_2d_keypoints/__init__.py new file mode 100644 index 00000000..2b06f19a --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints import Hand2dKeyPoints + +else: + _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py new file mode 100644 index 00000000..15a97c30 --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.pose import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints) +class Hand2dKeyPoints(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py new file mode 100644 index 00000000..5c1c72c1 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints_dataset import Hand2DKeypointDataset + +else: + _import_structure = { + 'hand_2d_keypoints_dataset': ['Hand2DKeypointDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py new file mode 100644 index 00000000..89ee0bb8 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + HandCocoWholeBodyDataset as _HandCocoWholeBodyDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.hand_2d_keypoints, + module_name=Datasets.HandCocoWholeBodyDataset) +class HandCocoWholeBodyDataset(EasyCVBaseDataset, _HandCocoWholeBodyDataset): + """EasyCV dataset for human hand 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. 
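+        Any remaining args/kwargs are forwarded unchanged to EasyCV's
+        HandCocoWholeBodyDataset (typically the data_source and pipeline
+        settings taken from the model configuration).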
+ """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _HandCocoWholeBodyDataset.__init__(self, *args, **kwargs) diff --git a/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py new file mode 100644 index 00000000..270ecbc4 --- /dev/null +++ b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py @@ -0,0 +1,72 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode, LogKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestHand2dKeypoints(unittest.TestCase): + model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def _train(self): + cfg_options = {'train.max_epochs': 20} + + trainer_name = Trainers.easycv + + train_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + eval_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_options=cfg_options) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_single_gpu(self): + self._train() + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_10.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_20.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 14e52b308aa6e67564b230ae49b0615615d752ec Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 14:41:26 +0800 Subject: [PATCH 037/129] fix token classification bugs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10385225 * fix token classification bugs --- modelscope/models/nlp/bert/__init__.py | 12 ++------ modelscope/models/nlp/bert/modeling_bert.py | 25 ++++++++-------- modelscope/models/nlp/token_classification.py | 29 +++++++++++++++++-- .../nlp/token_classification_pipeline.py | 7 ++++- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index 705d9519..cca79c2f 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -5,7 +5,6 @@ from 
modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForNextSentencePrediction, @@ -20,21 +19,14 @@ if TYPE_CHECKING: load_tf_weights_in_bert, ) - from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - from .tokenization_bert_fast import BertTokenizerFast + from .configuration_bert import BertConfig, BertOnnxConfig else: _import_structure = { - 'configuration_bert': - ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], - 'tokenization_bert': - ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], } - _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] _import_structure['modeling_bert'] = [ - 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', 'BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py index f8fd5994..e91a6433 100755 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -1872,19 +1872,18 @@ class BertForTokenClassification(BertPreTrainedModel): @add_start_docstrings_to_model_forward( BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c63e8037..e58967a5 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -176,7 +176,7 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): @MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) @MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): +class BertForTokenClassification(TokenClassification, BertPreTrainedModel): """Bert token classification model. Inherited from TokenClassificationBase. @@ -187,7 +187,7 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): def __init__(self, config, model_dir): if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix + BertForTokenClassification.base_model_prefix = config.base_model_prefix super().__init__(config, model_dir) def build_base_model(self): @@ -218,3 +218,28 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, **kwargs) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. 
+ num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 5367c1a8..c57dbf20 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -40,7 +40,12 @@ class TokenClassificationPipeline(Pipeline): sequence_length=kwargs.pop('sequence_length', 128)) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = getattr(model, 'id2label') + if hasattr(model, 'id2label'): + self.id2label = getattr(model, 'id2label') + else: + model_config = getattr(model, 'config') + self.id2label = getattr(model_config, 'id2label') + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ 'as a parameter or make sure the preprocessor has the attribute.' From 383452b0a4be12d3a5d15417042d7ccf3e285301 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 17:16:17 +0800 Subject: [PATCH 038/129] [to #45452180] python 3.10.x compatible Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10394282 * python 3.10.x compatible --- modelscope/utils/tensor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index b68a639c..406d671f 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. -from collections import Mapping +from collections.abc import Mapping def torch_nested_numpify(tensors): From 5bdb8fb78b5cb0d01431891d8e55cb5510a4ece4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Thu, 13 Oct 2022 18:30:06 +0800 Subject: [PATCH 039/129] [to #45451935]fix: add create model detail log for create failed. 
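This patch (039) adds a helper that logs the request URL and body whenever a POST to the hub fails, so that model-creation failures carry enough context to debug. A self-contained sketch of the same idea built directly on requests is shown here; the helper name, the log format and the canned 403 reply are illustrative, and the actual helper lands in the errors.py hunk that follows.

    import logging

    import requests
    from requests.exceptions import HTTPError

    logger = logging.getLogger(__name__)


    def log_and_raise_on_post_error(response: requests.Response, url: str, body: dict):
        """Re-raise HTTP errors, but log the request body and server reply first."""
        try:
            response.raise_for_status()
        except HTTPError:
            # requests exposes the server's reply as .text / .content
            logger.error('POST %s with body %s failed: HTTP %s, reply: %s', url,
                         body, response.status_code, response.text)
            raise


    # Build a canned 403 reply so the sketch runs without touching the real hub.
    resp = requests.models.Response()
    resp.status_code = 403
    resp.reason = 'Forbidden'
    resp.encoding = 'utf-8'
    resp._content = b'{"Code": 403, "Message": "permission denied"}'
    resp.url = 'https://example.com/api/v1/models'

    logging.basicConfig(level=logging.ERROR)
    body = {'Path': 'my-namespace', 'Name': 'my-model', 'Visibility': 5}
    try:
        log_and_raise_on_post_error(resp, resp.url, body)
    except HTTPError as err:
        print('caught:', err)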
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10382795 --- modelscope/hub/api.py | 24 +++++++++++------------- modelscope/hub/errors.py | 17 +++++++++++++++-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 8dcfa5b0..214045dd 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -24,8 +24,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode) from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, - datahub_raise_on_error, handle_http_response, is_ok, - raise_on_error) + datahub_raise_on_error, handle_http_post_error, + handle_http_response, is_ok, raise_on_error) from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, model_id_to_group_owner_name) @@ -105,17 +105,15 @@ class HubApi: path = f'{self.endpoint}/api/v1/models' owner_or_group, name = model_id_to_group_owner_name(model_id) - r = requests.post( - path, - json={ - 'Path': owner_or_group, - 'Name': name, - 'ChineseName': chinese_name, - 'Visibility': visibility, # server check - 'License': license - }, - cookies=cookies) - r.raise_for_status() + body = { + 'Path': owner_or_group, + 'Name': name, + 'ChineseName': chinese_name, + 'Visibility': visibility, # server check + 'License': license + } + r = requests.post(path, json=body, cookies=cookies) + handle_http_post_error(r, path, body) raise_on_error(r.json()) model_repo_url = f'{get_endpoint()}/{model_id}' return model_repo_url diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index c095a6ec..fb483287 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -4,6 +4,10 @@ from http import HTTPStatus from requests.exceptions import HTTPError +from modelscope.utils.logger import get_logger + +logger = get_logger() + class NotExistError(Exception): pass @@ -45,15 +49,24 @@ def is_ok(rsp): return rsp['Code'] == HTTPStatus.OK and rsp['Success'] +def handle_http_post_error(response, url, request_body): + try: + response.raise_for_status() + except HTTPError as error: + logger.error('Request %s with body: %s exception, respoonse body: %s' % + (url, request_body, response.body)) + raise error + + def handle_http_response(response, logger, cookies, model_id): try: response.raise_for_status() - except HTTPError: + except HTTPError as error: if cookies is None: # code in [403] and logger.error( f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ private. Please login first.') - raise + raise error def raise_on_error(rsp): From 6818ffdc8e598b5a8aeb525c05549b9bce5b3784 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 19:42:19 +0800 Subject: [PATCH 040/129] [to #42322933] feat: optimize ANS metric value Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10399100 --- modelscope/metrics/audio_noise_metric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py index f26db46d..8555e95b 100644 --- a/modelscope/metrics/audio_noise_metric.py +++ b/modelscope/metrics/audio_noise_metric.py @@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric): total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr return { 'total_loss': total_loss.item(), - 'avg_sisnr': avg_sisnr.item(), + # model use opposite number of sisnr as a calculation shortcut. 
+ # revert it in evaluation result + 'avg_sisnr': -avg_sisnr.item(), MetricKeys.AVERAGE_LOSS: avg_loss.item() } From c5c14ad60a8ba573263078892ada19f47698fc1c Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Thu, 13 Oct 2022 22:25:57 +0800 Subject: [PATCH 041/129] [to #42322933]fix psnr/ssim metrics for NAFNet (image denoise) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10403246 --- modelscope/metrics/image_denoise_metric.py | 193 +++++++++++++----- .../image_denoise/nafnet_for_image_denoise.py | 10 +- .../msdatasets/task_datasets/__init__.py | 1 + 3 files changed, 149 insertions(+), 55 deletions(-) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index c6df8df1..1692f299 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,14 +1,16 @@ -# The code is modified based on BasicSR metrics: -# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ +# Copyright (c) Alibaba, Inc. and its affiliates. +# ------------------------------------------------------------------------ +# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ from typing import Dict import cv2 import numpy as np +import torch from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) from .base import Metric from .builder import METRICS, MetricKeys @@ -22,16 +24,15 @@ class ImageDenoiseMetric(Metric): label_name = 'target' def __init__(self): + super(ImageDenoiseMetric, self).__init__() self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): ground_truths = outputs[ImageDenoiseMetric.label_name] eval_results = outputs[ImageDenoiseMetric.pred_name] - self.preds.append( - torch_nested_numpify(torch_nested_detach(eval_results))) - self.labels.append( - torch_nested_numpify(torch_nested_detach(ground_truths))) + self.preds.append(eval_results) + self.labels.append(ground_truths) def evaluate(self): psnr_list, ssim_list = [], [] @@ -69,80 +70,117 @@ def reorder_image(img, input_order='HWC'): return img -def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_psnr(img1, img2, crop_border, input_order='HWC'): """Calculate PSNR (Peak Signal-to-Noise Ratio). - Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio Args: - img (ndarray): Images with range [0, 255]. - img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. - input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + img1 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + img2 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the PSNR calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: PSNR result. + float: psnr result. 
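For reference while reading the reworked metric: PSNR is a log-scaled mean squared error measured against the peak of the image range (1.0 or 255). A minimal NumPy-only sketch, independent of the BasicSR/NAFNet code being patched here:

    import numpy as np


    def psnr(img1: np.ndarray, img2: np.ndarray, max_value: float = 255.0) -> float:
        """Peak signal-to-noise ratio in dB; identical images give inf."""
        mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
        if mse == 0:
            return float('inf')
        return 20.0 * np.log10(max_value / np.sqrt(mse))


    rng = np.random.default_rng(0)
    clean = rng.integers(0, 256, size=(64, 64, 3)).astype(np.float64)
    noisy = np.clip(clean + rng.normal(0.0, 5.0, clean.shape), 0, 255)
    print(round(psnr(clean, noisy), 2))   # around 34 dB for this noise level

The patched calculate_psnr above computes the same formula, plus border cropping, channel reordering and tensor handling.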
""" - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - img = img.astype(np.float64) - img2 = img2.astype(np.float64) + def _psnr(img1, img2): + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + max_value = 1. if img1.max() <= 1 else 255. + return 20. * np.log10(max_value / np.sqrt(mse)) - mse = np.mean((img - img2)**2) - if mse == 0: - return float('inf') - return 10. * np.log10(255. * 255. / mse) + return _psnr(img1, img2) -def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True): """Calculate SSIM (structural similarity). - ``Paper: Image quality assessment: From error visibility to structural similarity`` + Ref: + Image quality assessment: From error visibility to structural similarity The results are the same as that of the official released MATLAB code in https://ece.uwaterloo.ca/~z70wang/research/ssim/. For three-channel images, SSIM is calculated for each channel and then averaged. Args: - img (ndarray): Images with range [0, 255]. + img1 (ndarray): Images with range [0, 255]. img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the SSIM calculation. input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: SSIM result. + float: ssim result. """ - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. 
Supported input_orders are ' + '"HWC" and "CHW"') + + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - img = img.astype(np.float64) - img2 = img2.astype(np.float64) + def _cal_ssim(img1, img2): + ssims = [] + + max_value = 1 if img1.max() <= 1 else 255 + with torch.no_grad(): + final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim( + img1, img2, max_value) + ssims.append(final_ssim) - ssims = [] - for i in range(img.shape[2]): - ssims.append(_ssim(img[..., i], img2[..., i])) - return np.array(ssims).mean() + return np.array(ssims).mean() + return _cal_ssim(img1, img2) -def _ssim(img, img2): + +def _ssim(img, img2, max_value): """Calculate SSIM (structural similarity) for one channel images. It is called by func:`calculate_ssim`. Args: @@ -152,8 +190,11 @@ def _ssim(img, img2): float: SSIM result. """ - c1 = (0.01 * 255)**2 - c2 = (0.03 * 255)**2 + c1 = (0.01 * max_value)**2 + c2 = (0.03 * max_value)**2 + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) kernel = cv2.getGaussianKernel(11, 1.5) window = np.outer(kernel, kernel.transpose()) @@ -171,3 +212,61 @@ def _ssim(img, img2): tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) ssim_map = tmp1 / tmp2 return ssim_map.mean() + + +def _3d_gaussian_calculator(img, conv3d): + out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) + return out + + +def _generate_3d_gaussian_kernel(): + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + kernel_3 = cv2.getGaussianKernel(11, 1.5) + kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0)) + conv3d = torch.nn.Conv3d( + 1, + 1, (11, 11, 11), + stride=1, + padding=(5, 5, 5), + bias=False, + padding_mode='replicate') + conv3d.weight.requires_grad = False + conv3d.weight[0, 0, :, :, :] = kernel + return conv3d + + +def _ssim_3d(img1, img2, max_value): + assert len(img1.shape) == 3 and len(img2.shape) == 3 + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + Returns: + float: ssim result. 
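One detail worth noting in the new metric code: model outputs arrive as NCHW torch tensors in [0, 1], while the metric math expects HWC NumPy arrays in float64, so each entry point first drops the batch dimension, moves channels last and converts the dtype. A standalone sketch of that normalization step (assuming torch is installed; the helper name is mine):

    import numpy as np
    import torch


    def to_hwc_numpy(img) -> np.ndarray:
        """Accept an HWC ndarray or an (N)CHW tensor; return a float64 HWC ndarray."""
        if isinstance(img, torch.Tensor):
            if img.dim() == 4:                    # NCHW, metrics run per image
                img = img.squeeze(0)
            img = img.detach().cpu().numpy().transpose(1, 2, 0)   # CHW -> HWC
        return img.astype(np.float64)


    t = torch.rand(1, 3, 32, 32)    # a denoised output in [0, 1], batch of one
    a = to_hwc_numpy(t)
    print(a.shape, a.dtype)         # (32, 32, 3) float64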
+ """ + C1 = (0.01 * max_value)**2 + C2 = (0.03 * max_value)**2 + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + + kernel = _generate_3d_gaussian_kernel().cuda() + + img1 = torch.tensor(img1).float().cuda() + img2 = torch.tensor(img2).float().cuda() + + mu1 = _3d_gaussian_calculator(img1, kernel) + mu2 = _3d_gaussian_calculator(img2, kernel) + + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq + sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq + sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2) + tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) + ssim_map = tmp1 / tmp2 + return float(ssim_map.mean()) diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index a6fbf22f..4e8fc0ed 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -3,7 +3,6 @@ import os from copy import deepcopy from typing import Any, Dict, Union -import numpy as np import torch.cuda from torch.nn.parallel import DataParallel, DistributedDataParallel @@ -78,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): def _evaluate_postprocess(self, input: Tensor, target: Tensor) -> Dict[str, list]: preds = self.model(input) - preds = list(torch.split(preds, 1, 0)) - targets = list(torch.split(target, 1, 0)) - - preds = [(pred.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] - targets = [(target.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] + preds = list(torch.split(preds.clamp(0, 1), 1, 0)) + targets = list(torch.split(target.clamp(0, 1), 1, 0)) return {'pred': preds, 'target': targets} diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index 35c060f0..7c31969a 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -26,6 +26,7 @@ else: 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], 'image_inpainting': ['ImageInpaintingDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys From 275f8b432328cfe9df38a105e616233c63efb6a1 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Fri, 14 Oct 2022 13:55:09 +0800 Subject: [PATCH 042/129] Revert "[to #45071449] fix setup error " This reverts commit a26e6e38697a8795b99de4c7929b415baef78268. 
--- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 - 2 files changed, 1 deletion(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/requirements/framework.txt b/requirements/framework.txt index aae200da..b51faeda 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,7 +15,6 @@ pyyaml requests scipy setuptools -setuptools_scm tensorboard tqdm>=4.64.0 yapf From 155856301f0e4f61be0d4753734f1496e7cbf7ce Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Fri, 14 Oct 2022 14:00:57 +0800 Subject: [PATCH 043/129] [to #42322933] do not check training config in pipeline() Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10407849 --- modelscope/utils/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index c4fa3c1b..e46da7df 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -609,11 +609,12 @@ class Config: return parse_fn(args) -def check_config(cfg: Union[str, ConfigDict]): +def check_config(cfg: Union[str, ConfigDict], is_training=False): """ Check whether configuration file is valid, If anything wrong, exception will be raised. Args: cfg (str or ConfigDict): Config file path or config object. + is_training: indicate if checking training related elements """ if isinstance(cfg, str): @@ -627,8 +628,9 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.task) check_attr(ConfigFields.pipeline) - if hasattr(cfg, ConfigFields.train): + if is_training: check_attr(ConfigFields.model) + check_attr(ConfigFields.train) check_attr(ConfigFields.preprocessor) check_attr(ConfigFields.evaluation) From 355da866c553216a2b45b5f1ae68a27eebcf62ec Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 18:07:29 +0800 Subject: [PATCH 044/129] [to #42322933] limit tranformers version temporarily --- requirements/nlp.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index f18dde2e..2e0838fc 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -3,7 +3,7 @@ fasttext jieba>=0.42.1 megatron_util pai-easynlp -# “protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.” +# protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated @@ -14,4 +14,5 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers>=4.12.0 +# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. 
+transformers>=4.12.0,<=4.22.0 From 876058556deabcdf1a399e79983444d97ec790f2 Mon Sep 17 00:00:00 2001 From: hemu Date: Fri, 14 Oct 2022 18:15:52 +0800 Subject: [PATCH 045/129] fix generate --- modelscope/models/nlp/gpt3/modeling_gpt3.py | 3 +++ requirements/nlp.txt | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 498d15de..ade36e36 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -346,3 +346,6 @@ class GPT3Model(PreTrainedModel): } model.load_state_dict(state_dict) return model + + def prepare_inputs_for_generation(self, input_ids, *args, **kwargs): + return {'input_ids': input_ids} diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 2e0838fc..123c238e 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -14,5 +14,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. -transformers>=4.12.0,<=4.22.0 +transformers From 1b4d5ccb9c8b7a7d93c91aa85e43b017826df2c0 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Fri, 14 Oct 2022 18:32:38 +0800 Subject: [PATCH 046/129] [to #42322933]MsDataset upload and load supports directory. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 上传和下载支持多文件操作 --- modelscope/hub/api.py | 34 ++++--- modelscope/hub/utils/utils.py | 8 +- modelscope/msdatasets/ms_dataset.py | 52 +++++++---- modelscope/msdatasets/utils/dataset_utils.py | 90 ++++++++++++++++++- modelscope/msdatasets/utils/download_utils.py | 18 ++-- modelscope/msdatasets/utils/oss_utils.py | 9 +- modelscope/msdatasets/utils/upload_utils.py | 40 ++++++++- modelscope/utils/constant.py | 7 ++ tests/msdatasets/test_dataset_upload.py | 43 ++++++++- 9 files changed, 250 insertions(+), 51 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 214045dd..dc4d0ab2 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -26,18 +26,15 @@ from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, datahub_raise_on_error, handle_http_post_error, handle_http_response, is_ok, raise_on_error) -from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, - model_id_to_group_owner_name) +from .utils.utils import get_endpoint, model_id_to_group_owner_name logger = get_logger() class HubApi: - def __init__(self, endpoint=None, dataset_endpoint=None): + def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() - self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( - ) def login( self, @@ -288,7 +285,7 @@ class HubApi: return files def list_datasets(self): - path = f'{self.dataset_endpoint}/api/v1/datasets' + path = f'{self.endpoint}/api/v1/datasets' headers = None params = {} r = requests.get(path, params=params, headers=headers) @@ -315,13 +312,13 @@ class HubApi: cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] - datahub_url = 
f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) @@ -339,7 +336,7 @@ class HubApi: file_path = file_info['Path'] extension = os.path.splitext(file_path)[-1] if extension in dataset_meta_format: - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_path}' r = requests.get(datahub_url) r.raise_for_status() @@ -363,7 +360,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): if file_name.endswith('.csv'): - file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_name}' return file_name @@ -372,7 +369,7 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' return self.datahub_remote_call(datahub_url) @@ -383,7 +380,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' cookies = requests.utils.dict_from_cookiejar(cookies) @@ -392,6 +389,19 @@ class HubApi: raise_on_error(resp) return resp['Data'] + def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, + is_recursive, is_filter_dir, revision, + cookies): + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' 
\ + f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' + cookies = requests.utils.dict_from_cookiejar(cookies) + + resp = requests.get(url=url, cookies=cookies) + resp = resp.json() + raise_on_error(resp) + resp = resp['Data'] + return resp + def on_dataset_download(self, dataset_name: str, namespace: str) -> None: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' r = requests.post(url) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index d84b78ea..7d3c2499 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,8 +4,7 @@ import hashlib import os from typing import Optional -from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, - DEFAULT_MODELSCOPE_DOMAIN, +from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) @@ -44,11 +43,6 @@ def get_endpoint(): return MODELSCOPE_URL_SCHEME + modelscope_domain -def get_dataset_hub_endpoint(): - return os.environ.get('HUB_DATASET_ENDPOINT', - DEFAULT_MODELSCOPE_DATA_ENDPOINT) - - def compute_hash(file_path): BUFFER_SIZE = 1024 * 64 # 64k buffer size sha256_hash = hashlib.sha256() diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 361b8ae0..cf055d6d 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import math import os from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) @@ -17,19 +16,18 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.hub.repository import DatasetRepository +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.utils.dataset_builder import ExternalDataset +from modelscope.msdatasets.utils.dataset_utils import ( + get_dataset_files, get_target_dataset_structure, load_dataset_builder) +from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager +from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.utils.config import ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger -from .task_datasets.builder import build_task_dataset -from .utils.dataset_builder import ExternalDataset -from .utils.dataset_utils import (get_dataset_files, - get_target_dataset_structure, - load_dataset_builder) -from .utils.download_utils import DatasetDownloadManager -from .utils.upload_utils import DatasetUploadManager logger = get_logger() @@ -234,7 +232,6 @@ class MsDataset: # dataset organized to be compatible with hf format if dataset_formation == DatasetFormations.hf_compatible: dataset_name = dataset_scripts['.py'][0] - download_dataset = dataset_name else: raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " @@ -270,7 +267,8 @@ class MsDataset: raise TypeError('path must be a str or a list, but got' f' {type(dataset_name)}') - if download_dataset: + is_ci_test = os.getenv('CI_TEST') == 'True' + if download_dataset and not is_ci_test: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) @@ -570,15 +568,26 @@ class 
MsDataset: local_file_path: str, dataset_name: str, namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE, - version: Optional[str] = DEFAULT_DATASET_REVISION) -> None: - """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first. + version: Optional[str] = DEFAULT_DATASET_REVISION, + num_processes: Optional[int] = None, + chunksize: Optional[int] = 1, + filter_hidden_files: Optional[bool] = True) -> None: + """Upload dataset file or directory to the ModelScope Hub. Please login to the ModelScope Hub first. Args: - object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip - local_file_path (str): Local file to upload + object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name + local_file_path (str): Local file or directory to upload dataset_name (str): Name of the dataset namespace(str, optional): Namespace of the dataset version: Optional[str]: Version of the dataset + num_processes: Optional[int]: The number of processes used for multi-process uploading. + This is only applicable when local_file_path is a directory, and we are uploading mutliple-files + insided the directory. When None provided, the number returned by os.cpu_count() is used as default. + chunksize: Optional[int]: The chunksize of objects to upload. + For very long iterables using a large value for chunksize can make the job complete much faster than + using the default value of 1. Available if local_file_path is a directory. + filter_hidden_files: Optional[bool]: Whether to filter hidden files. + Available if local_file_path is a directory. Returns: None @@ -586,7 +595,20 @@ class MsDataset: """ _upload_manager = DatasetUploadManager( dataset_name=dataset_name, namespace=namespace, version=version) - _upload_manager.upload(object_name, local_file_path) + + if os.path.isfile(local_file_path): + _upload_manager.upload( + object_name=object_name, local_file_path=local_file_path) + elif os.path.isdir(local_file_path): + _upload_manager.upload_dir( + object_dir_name=object_name, + local_dir_path=local_file_path, + num_processes=num_processes, + chunksize=chunksize, + filter_hidden_files=filter_hidden_files) + else: + raise ValueError( + f'{local_file_path} is not a valid file path or directory') @staticmethod def clone_meta(dataset_work_dir: str, diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ef42f75f..db9d1fee 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,8 @@ from typing import Any, Mapping, Optional, Sequence, Union from datasets.builder import DatasetBuilder -from modelscope.utils.constant import DEFAULT_DATASET_REVISION +from modelscope.hub.api import HubApi +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams from modelscope.utils.logger import get_logger from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder @@ -77,6 +78,81 @@ def get_target_dataset_structure(dataset_structure: dict, return target_subset_name, target_dataset_structure +def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool, + dataset_name: str, namespace: str, + version: str) -> list: + """ + List all of objects for specific dataset. + + Args: + hub_api (class HubApi): HubApi instance. + max_limit (int): Max number of objects. + is_recursive (bool): Whether to list objects recursively. + dataset_name (str): Dataset name. 
+ namespace (str): Namespace. + version (str): Dataset version. + Returns: + res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...] + """ + res = [] + cookies = hub_api.check_cookies_upload_data(use_cookies=True) + objects = hub_api.list_oss_dataset_objects( + dataset_name=dataset_name, + namespace=namespace, + max_limit=max_limit, + is_recursive=is_recursive, + is_filter_dir=True, + revision=version, + cookies=cookies) + + for item in objects: + object_key = item.get('Key') + res.append(object_key) + + return res + + +def contains_dir(file_map) -> bool: + """ + To check whether input contains at least one directory. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'} + Returns: + True if input contains at least one directory, False otherwise. + """ + res = False + for k, v in file_map.items(): + if isinstance(v, str) and not v.endswith('.zip'): + res = True + break + return res + + +def get_split_objects_map(file_map, objects): + """ + Get the map between dataset split and oss objects. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val + are dirs. + objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png'] + Returns: + A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'], + 'validation':['val/003/3_38.png']} + """ + res = {} + for k, v in file_map.items(): + res[k] = [] + + for obj_key in objects: + for k, v in file_map.items(): + if obj_key.startswith(v): + res[k].append(obj_key) + + return res + + def get_dataset_files(subset_split_into: dict, dataset_name: str, namespace: str, @@ -95,14 +171,24 @@ def get_dataset_files(subset_split_into: dict, meta_map = defaultdict(dict) file_map = defaultdict(dict) args_map = defaultdict(dict) - from modelscope.hub.api import HubApi modelscope_api = HubApi() + objects = list_dataset_objects( + hub_api=modelscope_api, + max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value, + is_recursive=True, + dataset_name=dataset_name, + namespace=namespace, + version=revision) + for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] args_map[split] = info.get('args') + + if contains_dir(file_map): + file_map = get_split_objects_map(file_map, objects) return meta_map, file_map, args_map diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py index 2e21bf50..b1c7a5ab 100644 --- a/modelscope/msdatasets/utils/download_utils.py +++ b/modelscope/msdatasets/utils/download_utils.py @@ -10,16 +10,14 @@ from .oss_utils import OssUtilities class DatasetDownloadManager(DownloadManager): - def __init__( - self, - dataset_name: str, - namespace: str, - version: str, - data_dir: Optional[str] = None, - download_config: Optional[DownloadConfig] = None, - base_path: Optional[str] = None, - record_checksums=True, - ): + def __init__(self, + dataset_name: str, + namespace: str, + version: str, + data_dir: Optional[str] = None, + download_config: Optional[DownloadConfig] = None, + base_path: Optional[str] = None, + record_checksums=True): super().__init__(dataset_name, data_dir, download_config, base_path, record_checksums) self._namespace = namespace diff --git a/modelscope/msdatasets/utils/oss_utils.py 
b/modelscope/msdatasets/utils/oss_utils.py index 4a403876..d7d61e89 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -50,11 +50,16 @@ class OssUtilities: progress_callback=self._percentage) return local_path - def upload(self, oss_object_name: str, local_file_path: str) -> str: + def upload(self, oss_object_name: str, local_file_path: str, + indicate_individual_progress: bool) -> str: retry_count = 0 object_key = os.path.join(self.oss_dir, oss_object_name) resumable_store = oss2.ResumableStore( root=self.upload_resumable_tmp_store) + if indicate_individual_progress: + progress_callback = self._percentage + else: + progress_callback = None while True: try: @@ -66,7 +71,7 @@ class OssUtilities: store=resumable_store, multipart_threshold=self.upload_multipart_threshold, part_size=self.upload_part_size, - progress_callback=self._percentage, + progress_callback=progress_callback, num_threads=self.upload_num_threads) break except Exception: diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index 4813b89f..2b4422b2 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,5 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os +from multiprocessing.dummy import Pool as ThreadPool + +from tqdm import tqdm + from .oss_utils import OssUtilities @@ -19,5 +24,38 @@ class DatasetUploadManager(object): def upload(self, object_name: str, local_file_path: str) -> str: object_key = self.oss_utilities.upload( - oss_object_name=object_name, local_file_path=local_file_path) + oss_object_name=object_name, + local_file_path=local_file_path, + indicate_individual_progress=True) return object_key + + def upload_dir(self, object_dir_name: str, local_dir_path: str, + num_processes: int, chunksize: int, + filter_hidden_files: bool) -> int: + + def run_upload(args): + self.oss_utilities.upload( + oss_object_name=args[0], + local_file_path=args[1], + indicate_individual_progress=False) + + files_list = [] + for root, dirs, files in os.walk(local_dir_path): + for file_name in files: + if filter_hidden_files and file_name.startswith('.'): + continue + # Concatenate directory name and relative path into a oss object key. e.g., train/001/1_1230.png + object_name = os.path.join( + object_dir_name, + root.replace(local_dir_path, '', 1).strip('/'), file_name) + + local_file_path = os.path.join(root, file_name) + files_list.append((object_name, local_file_path)) + + with ThreadPool(processes=num_processes) as pool: + result = list( + tqdm( + pool.imap(run_upload, files_list, chunksize=chunksize), + total=len(files_list))) + + return len(result) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5f0532ce..9e10e802 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -227,6 +227,13 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadParams(enum.Enum): + """ + Parameters for downloading dataset. 
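The new upload_dir above walks the local directory, builds an (object key, local path) pair for every file, skips hidden files on request, and fans the uploads out over a thread pool, a reasonable shape for I/O-bound OSS transfers. A stripped-down sketch of the same pattern with the actual upload faked out (names are illustrative):

    import os
    from multiprocessing.dummy import Pool as ThreadPool  # thread-based pool


    def iter_upload_jobs(object_dir_name, local_dir_path, filter_hidden_files=True):
        """Yield (oss_object_key, local_file_path) pairs for every file in the tree."""
        for root, _dirs, files in os.walk(local_dir_path):
            for file_name in files:
                if filter_hidden_files and file_name.startswith('.'):
                    continue
                rel_dir = os.path.relpath(root, local_dir_path)
                rel_dir = '' if rel_dir == '.' else rel_dir
                yield (os.path.join(object_dir_name, rel_dir, file_name),
                       os.path.join(root, file_name))


    def fake_upload(job):
        object_key, local_path = job
        return f'{local_path} -> oss://bucket/{object_key}'


    jobs = list(iter_upload_jobs('train', '.'))
    with ThreadPool(processes=4) as pool:
        for line in pool.imap(fake_upload, jobs, chunksize=1):
            print(line)

Threads rather than processes are a sensible default here because each worker mostly waits on the network, and chunksize only starts to matter when the file list is very long.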
+ """ + MAX_LIST_OBJECTS_NUM = 50000 + + class DatasetFormations(enum.Enum): """ How a dataset is organized and interpreted """ diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 1179414d..3d35d480 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -6,9 +6,13 @@ import unittest import zipfile from modelscope.msdatasets import MsDataset -from modelscope.utils.constant import ModelFile +from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects +from modelscope.utils import logger as logging +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile from modelscope.utils.test_utils import test_level +logger = logging.get_logger(__name__) + KEY_EXTRACTED = 'extracted' @@ -39,7 +43,8 @@ class DatasetUploadTest(unittest.TestCase): def tearDown(self): os.chdir(self.old_dir) shutil.rmtree(self.temp_dir, ignore_errors=True) - print('The test dir successfully removed!') + logger.info( + f'Temporary directory {self.temp_dir} successfully removed!') @staticmethod def get_raw_downloaded_file_path(extracted_path): @@ -68,6 +73,40 @@ class DatasetUploadTest(unittest.TestCase): dataset_name=self.dataset_name, namespace=self.namespace) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_upload_dir(self): + ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train') + config_train = ms_ds_train._hf_ds.config_kwargs + extracted_path_train = config_train.get('split_config').get('train') + + MsDataset.upload( + object_name='train', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/train'), + dataset_name=self.dataset_name, + namespace=self.namespace) + MsDataset.upload( + object_name='val', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/val'), + dataset_name=self.dataset_name, + namespace=self.namespace) + + objects = list_dataset_objects( + hub_api=self.api, + max_limit=-1, + is_recursive=True, + dataset_name=self.dataset_name, + namespace=self.namespace, + version=DEFAULT_DATASET_REVISION) + + logger.info(f'{len(objects)} objects have been uploaded: {objects}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_download_dir(self): + test_ds = MsDataset.load(self.dataset_name, self.namespace) + assert test_ds.config_kwargs['split_config'].values() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_clone_meta(self): MsDataset.clone_meta( From deb847614a518537a22567209519dbea89feabcd Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 21:59:52 +0800 Subject: [PATCH 047/129] [to #42322933] limit espnet version --- requirements/audio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/audio.txt b/requirements/audio.txt index 742cf166..bef32121 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,5 +1,5 @@ easyasr>=0.0.2 -espnet>=202204 +espnet==202204 h5py inflect keras From 202fcdf2984a214e8d4a55b11607eefafa77af0f Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Fri, 14 Oct 2022 23:11:19 +0800 Subject: [PATCH 048/129] [to #42322933] change tableqa output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改output的结构,直接返回可转化成json format的结构 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10415403 --- .../models/nlp/table_question_answering.py | 6 +++--- modelscope/outputs.py | 7 ++++++- 
.../nlp/table_question_answering_pipeline.py | 3 ++- .../pipelines/test_table_question_answering.py | 18 +++++++++++------- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index c6a03ef3..c2134df2 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -691,11 +691,11 @@ class TableQuestionAnswering(Model): sels.append(l_hs[ib] - 1) aggs.append(sql['agg'][ia]) continue - sels.append(sel) + sels.append(int(sel)) if sql['agg'][ia] == -1: aggs.append(0) else: - aggs.append(sql['agg'][ia]) + aggs.append(int(sql['agg'][ia])) if len(sels) == 0: sels.append(l_hs[ib] - 1) aggs.append(0) @@ -712,7 +712,7 @@ class TableQuestionAnswering(Model): for i in range(wl): if wc_os[i] == -1: continue - conds.append([wc_os[i], wo_os[i], pr_wvi_str[ib][i]]) + conds.append([int(wc_os[i]), int(wo_os[i]), pr_wvi_str[ib][i]]) if len(conds) == 0: conds.append([l_hs[ib] - 1, 2, 'Nulll']) sql['conds'] = conds diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 3001c03c..c08779b4 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -36,6 +36,8 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + SQL_STRING = 'sql_string' + SQL_QUERY = 'sql_query' HISTORY = 'history' QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' @@ -583,7 +585,10 @@ TASK_OUTPUTS = { # "sql": "SELECT shop.Name FROM shop." # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]} # } - Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY], + Tasks.table_question_answering: [ + OutputKeys.SQL_STRING, OutputKeys.SQL_QUERY, OutputKeys.HISTORY, + OutputKeys.QUERT_RESULT + ], # ============ audio tasks =================== # asr result for single sample diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index e1b2b07b..ca17c9b1 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -311,7 +311,8 @@ class TableQuestionAnsweringPipeline(Pipeline): tabledata = {'headers': [], 'cells': []} output = { - OutputKeys.OUTPUT: sql, + OutputKeys.SQL_STRING: sql.string, + OutputKeys.SQL_QUERY: sql.query, OutputKeys.HISTORY: result['sql'], OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), } diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 68e0564f..3d943e51 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -3,10 +3,12 @@ import os import unittest from typing import List +import json from transformers import BertTokenizer from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor @@ -38,11 +40,12 @@ def tableqa_tracking_and_print_results_with_history( 'history_sql': historical_queries }) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', 
output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() - historical_queries = output_dict['history'] + historical_queries = output_dict[OutputKeys.HISTORY] def tableqa_tracking_and_print_results_without_history( @@ -60,9 +63,10 @@ def tableqa_tracking_and_print_results_without_history( for question in test_case['utterance']: output_dict = p({'question': question}) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() From 7e7303a658fae50a027420547035bb7319c64c76 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Sat, 15 Oct 2022 08:51:06 +0800 Subject: [PATCH 049/129] [to #42322933] remove fasttext from nlp requirements --- modelscope/utils/error.py | 9 +++++++++ modelscope/utils/import_utils.py | 1 + requirements/nlp.txt | 3 +-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index a6bbc8b3..a894063c 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -111,3 +111,12 @@ You can install it with pip on linux: On windows, please checkout the instructions on the installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment. """ + +# docstyle-ignore +FASTTEXT_IMPORT_ERROR = """ +{0} requires the fasttext library but it was not found in your environment. +You can install it with pip on linux or mac: +`pip install fasttext` +Or you can checkout the instructions on the +installation page: https://github.com/facebookresearch/fastText and follow the ones that match your environment. 
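Removing fasttext from the hard NLP requirements works because optional packages are resolved lazily: each one is paired with an error template like the one above, and the message only surfaces when a component that needs the package is actually imported. A tiny sketch of that soft-dependency idea (the function and hint text here are illustrative, not the real import_utils API):

    import importlib.util

    FASTTEXT_HINT = ('this component requires the fasttext library; '
                     'install it with `pip install fasttext`.')


    def require(package: str, hint: str):
        """Raise a helpful ImportError only when the optional package is missing."""
        if importlib.util.find_spec(package) is None:
            raise ImportError(hint)


    try:
        require('fasttext', FASTTEXT_HINT)
        print('fasttext available')
    except ImportError as err:
        print('deferred failure:', err)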
+""" diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 2a6fdc80..5db5ea98 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -292,6 +292,7 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)), ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)), ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)), + ('fasttext', (is_package_available('fasttext'), FASTTEXT_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 123c238e..a5f3cbd9 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,5 +1,4 @@ en_core_web_sm>=2.3.5 -fasttext jieba>=0.42.1 megatron_util pai-easynlp @@ -14,4 +13,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers +transformers>=4.12.0 From 4682783619f86726d0bb3c880c2100e91d126355 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 15 Oct 2022 20:33:55 +0800 Subject: [PATCH 050/129] [to #44902165] bump version to 0.5.0 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 1e4826d6..2b8877c5 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.7' +__version__ = '0.5.0' From f6e542cdcb6c1a1be690750bebda791ed5c90589 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 17 Oct 2022 10:40:08 +0800 Subject: [PATCH 051/129] refine pipeline input to support demo service * image_captioninig support single image and dict input * image_style_transfer use dict input Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10417330 --- modelscope/pipeline_inputs.py | 16 ++++++++++------ modelscope/pipelines/base.py | 6 +++++- tests/pipelines/test_image_style_transfer.py | 15 +++++++++------ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 2b14c278..34b731c6 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -97,8 +97,10 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.image_to_image_translation: InputType.IMAGE, - Tasks.image_style_transfer: - InputType.IMAGE, + Tasks.image_style_transfer: { + 'content': InputType.IMAGE, + 'style': InputType.IMAGE, + }, Tasks.image_portrait_stylization: InputType.IMAGE, Tasks.live_category: @@ -147,8 +149,9 @@ TASK_INPUTS = { InputType.TEXT, Tasks.translation: InputType.TEXT, - Tasks.word_segmentation: - InputType.TEXT, + Tasks.word_segmentation: [InputType.TEXT, { + 'text': InputType.TEXT, + }], Tasks.part_of_speech: InputType.TEXT, Tasks.named_entity_recognition: @@ -194,8 +197,9 @@ TASK_INPUTS = { InputType.AUDIO, # ============ multi-modal tasks =================== - Tasks.image_captioning: - InputType.IMAGE, + Tasks.image_captioning: [InputType.IMAGE, { + 'image': InputType.IMAGE, + }], Tasks.visual_grounding: { 'image': InputType.IMAGE, 'text': InputType.TEXT diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 5732a9d7..ea329be4 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -236,7 +236,11 @@ class Pipeline(ABC): if isinstance(input_type, list): matched_type = None for t in input_type: - if type(t) == type(input): + if isinstance(input, (dict, tuple)): + if type(t) == type(input): + matched_type = t + break + elif isinstance(t, str): matched_type = t break if matched_type 
is None: diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py index a02d5308..5f37f204 100644 --- a/tests/pipelines/test_image_style_transfer.py +++ b/tests/pipelines/test_image_style_transfer.py @@ -25,8 +25,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=snapshot_path) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer1.png', result[OutputKeys.OUTPUT_IMG]) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -35,8 +36,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=self.model_id) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer2.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub done') @@ -45,8 +47,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): image_style_transfer = pipeline(Tasks.image_style_transfer) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub_default_model done') From 88a7599efb0168cd1914e19d0990ab1b37fc7406 Mon Sep 17 00:00:00 2001 From: "wenqi.oywq" Date: Mon, 17 Oct 2022 14:05:12 +0800 Subject: [PATCH 052/129] [to #42322933]change output channels from RGB to BGR, to consistent with demo-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 默认输出为array的,通道格式统一为BGR格式,本次修改是为了与这个格式一致 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10422508 --- modelscope/pipelines/cv/image_color_enhance_pipeline.py | 2 +- tests/pipelines/test_image_color_enhance.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py index d21d879c..3a4cf8bc 100644 --- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py +++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py @@ -55,5 +55,5 @@ class ImageColorEnhancePipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: output_img = (inputs['outputs'].squeeze(0) * 255.).type( - torch.uint8).cpu().permute(1, 2, 0).numpy() + torch.uint8).cpu().permute(1, 2, 0).numpy()[:, :, ::-1] return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py index 9b72999e..7c3ae8c0 100644 --- a/tests/pipelines/test_image_color_enhance.py +++ b/tests/pipelines/test_image_color_enhance.py @@ -21,8 +21,7 @@ class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) if result is not None: - cv2.imwrite('result.png', 
result[OutputKeys.OUTPUT_IMG][:, :, - [2, 1, 0]]) + cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG]) print(f'Output written to {osp.abspath("result.png")}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From 674e1a7878f63603aa3bbc669fbac6a8b8a5b8a5 Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Mon, 17 Oct 2022 14:06:07 +0800 Subject: [PATCH 053/129] [to #42322933]cv/cvdet_fix_outputs->master fix outputs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10421413 * fix outputs --- .../pipelines/cv/image_detection_pipeline.py | 8 ++++++-- tests/pipelines/test_object_detection.py | 20 ++++--------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py index f5554ca2..08633c35 100644 --- a/modelscope/pipelines/cv/image_detection_pipeline.py +++ b/modelscope/pipelines/cv/image_detection_pipeline.py @@ -43,11 +43,15 @@ class ImageDetectionPipeline(Pipeline): bboxes, scores, labels = self.model.postprocess(inputs['data']) if bboxes is None: - return None + outputs = { + OutputKeys.SCORES: [], + OutputKeys.LABELS: [], + OutputKeys.BOXES: [] + } + return outputs outputs = { OutputKeys.SCORES: scores, OutputKeys.LABELS: labels, OutputKeys.BOXES: bboxes } - return outputs diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2cb217d9..00a71371 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -19,20 +19,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_vit_object-detection_coco' object_detect = pipeline(Tasks.image_object_detection, model=model_id) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_object_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' object_detect = pipeline(Tasks.image_object_detection) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_human_detection(self): @@ -40,20 +34,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_resnet18_human-detection' human_detect = pipeline(Tasks.human_detection, model=model_id) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_human_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' human_detect = pipeline(Tasks.human_detection) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 542c4ce1b3433ca1d51ac0c0349b3d8f87c51f41 Mon Sep 17 00:00:00 2001 From: "shichen.fsc" Date: Mon, 17 Oct 2022 14:07:05 +0800 Subject: [PATCH 054/129] [to #42322933] Fix bug in KWS when setting customized keyword Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10412829 --- .../audio/asr/generic_automatic_speech_recognition.py | 2 ++ 
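A short sketch of consuming the detection output after the fix above: empty results now come back as empty lists under the same keys, so callers can iterate without a None check. The model id and image path are the ones used in the object-detection test.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

object_detect = pipeline(
    Tasks.image_object_detection, model='damo/cv_vit_object-detection_coco')
result = object_detect('data/test/images/image_detection.jpg')
for box, score, label in zip(result[OutputKeys.BOXES],
                             result[OutputKeys.SCORES],
                             result[OutputKeys.LABELS]):
    print(label, score, box)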
modelscope/models/audio/kws/generic_key_word_spotting.py | 2 ++ modelscope/pipelines/audio/kws_kwsbp_pipeline.py | 9 +++++++++ modelscope/preprocessors/asr.py | 2 ++ modelscope/preprocessors/kws.py | 2 ++ tests/pipelines/test_key_word_spotting.py | 2 +- 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py index 11accf0a..aebc6751 100644 --- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py +++ b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/models/audio/kws/generic_key_word_spotting.py b/modelscope/models/audio/kws/generic_key_word_spotting.py index c1b7a0e4..2f70327d 100644 --- a/modelscope/models/audio/kws/generic_key_word_spotting.py +++ b/modelscope/models/audio/kws/generic_key_word_spotting.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 450a12bb..5555c9e6 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -37,6 +37,12 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): **kwargs) -> Dict[str, Any]: if 'keywords' in kwargs.keys(): self.keywords = kwargs['keywords'] + if isinstance(self.keywords, str): + word_list = [] + word = {} + word['keyword'] = self.keywords + word_list.append(word) + self.keywords = word_list else: self.keywords = None @@ -96,6 +102,9 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): pos_list=pos_kws_list, neg_list=neg_kws_list) + if 'kws_list' not in rst_dict: + rst_dict['kws_list'] = [] + return rst_dict def run_with_kwsbp(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index d58383d7..facaa132 100644 --- a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, List, Union diff --git a/modelscope/preprocessors/kws.py b/modelscope/preprocessors/kws.py index 9c370ed5..6f09d545 100644 --- a/modelscope/preprocessors/kws.py +++ b/modelscope/preprocessors/kws.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
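A hedged sketch of the customized-keyword fix above: keywords may now be passed as a plain string and is wrapped into the [{'keyword': ...}] structure internally, and 'kws_list' is always present in the result. The task constant, model id and audio path below are assumptions for illustration only.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

kws = pipeline(Tasks.keyword_spotting, model='<your-kws-model-id>')
result = kws('<path/to/audio.wav>', keywords='播放音乐')
print(result['kws_list'])  # [] when nothing is spotted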
+ import os from typing import Any, Dict, List, Union diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py index 91f9f566..f31d212b 100644 --- a/tests/pipelines/test_key_word_spotting.py +++ b/tests/pipelines/test_key_word_spotting.py @@ -245,7 +245,7 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_by_customized_keywords(self): - keywords = [{'keyword': '播放音乐'}] + keywords = '播放音乐' kws_result = self.run_pipeline( model_id=self.model_id, From 8fa385e27cc9a949c8544e838a6250ab527b0685 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Mon, 17 Oct 2022 15:42:24 +0800 Subject: [PATCH 055/129] [to #42322933] Add upload in hub api Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10386689 --- modelscope/hub/git.py | 10 +++ modelscope/hub/upload.py | 117 +++++++++++++++++++++++++ tests/hub/test_hub_upload.py | 164 +++++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+) create mode 100644 modelscope/hub/upload.py create mode 100644 tests/hub/test_hub_upload.py diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index a149ede1..db76506e 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import re import subprocess from typing import List from xmlrpc.client import Boolean @@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton): cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision] return self._run_git_command(*cmds) + def get_remote_branches(self, repo_dir: str): + cmds = ['-C', '%s' % repo_dir, 'branch', '-r'] + rsp = self._run_git_command(*cmds) + info = [ + line.strip() + for line in rsp.stdout.decode('utf8').strip().split(os.linesep) + ][1:] + return ['/'.join(line.split('/')[1:]) for line in info] + def pull(self, repo_dir: str): cmds = ['-C', repo_dir, 'pull'] return self._run_git_command(*cmds) diff --git a/modelscope/hub/upload.py b/modelscope/hub/upload.py new file mode 100644 index 00000000..9dffc60e --- /dev/null +++ b/modelscope/hub/upload.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import datetime +import os +import shutil +import tempfile +import uuid +from typing import Dict, Optional +from uuid import uuid4 + +from filelock import FileLock + +from modelscope import __version__ +from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.errors import InvalidParameter, NotLoginException +from modelscope.hub.git import GitCommandWrapper +from modelscope.hub.repository import Repository +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def upload_folder(model_id: str, + model_dir: str, + visibility: int = 0, + license: str = None, + chinese_name: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION): + """ + Upload model from a given directory to given repository. A valid model directory + must contain a configuration.json file. + + This function upload the files in given directory to given repository. If the + given repository is not exists in remote, it will automatically create it with + given visibility, license and chinese_name parameters. If the revision is also + not exists in remote repository, it will create a new branch for it. 
+ + This function must be called before calling HubApi's login with a valid token + which can be obtained from ModelScope's website. + + Args: + model_id (`str`): + The model id to be uploaded, caller must have write permission for it. + model_dir(`str`): + The Absolute Path of the finetune result. + visibility(`int`, defaults to `0`): + Visibility of the new created model(1-private, 5-public). If the model is + not exists in ModelScope, this function will create a new model with this + visibility and this parameter is required. You can ignore this parameter + if you make sure the model's existence. + license(`str`, defaults to `None`): + License of the new created model(see License). If the model is not exists + in ModelScope, this function will create a new model with this license + and this parameter is required. You can ignore this parameter if you + make sure the model's existence. + chinese_name(`str`, *optional*, defaults to `None`): + chinese name of the new created model. + commit_message(`str`, *optional*, defaults to `None`): + commit message of the push request. + revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): + which branch to push. If the branch is not exists, It will create a new + branch and push to it. + """ + if model_id is None: + raise InvalidParameter('model_id cannot be empty!') + if model_dir is None: + raise InvalidParameter('model_dir cannot be empty!') + if not os.path.exists(model_dir) or os.path.isfile(model_dir): + raise InvalidParameter('model_dir must be a valid directory.') + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if not os.path.exists(cfg_file): + raise ValueError(f'{model_dir} must contain a configuration.json.') + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException('Must login before upload!') + files_to_save = os.listdir(model_dir) + api = HubApi() + try: + api.get_model(model_id=model_id) + except Exception: + if visibility is None or license is None: + raise InvalidParameter( + 'visibility and license cannot be empty if want to create new repo' + ) + logger.info('Create new model %s' % model_id) + api.create_model( + model_id=model_id, + visibility=visibility, + license=license, + chinese_name=chinese_name) + tmp_dir = tempfile.mkdtemp() + git_wrapper = GitCommandWrapper() + try: + repo = Repository(model_dir=tmp_dir, clone_from=model_id) + branches = git_wrapper.get_remote_branches(tmp_dir) + if revision not in branches: + logger.info('Create new branch %s' % revision) + git_wrapper.new_branch(tmp_dir, revision) + git_wrapper.checkout(tmp_dir, revision) + for f in files_to_save: + if f[0] != '.': + src = os.path.join(model_dir, f) + if os.path.isdir(src): + shutil.copytree(src, os.path.join(tmp_dir, f)) + else: + shutil.copy(src, tmp_dir) + if not commit_message: + date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + commit_message = '[automsg] push model %s to hub at %s' % ( + model_id, date) + repo.push(commit_message=commit_message, branch=revision) + except Exception: + raise + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py new file mode 100644 index 00000000..d7e6e439 --- /dev/null +++ b/tests/hub/test_hub_upload.py @@ -0,0 +1,164 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
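A minimal sketch of the upload flow documented above, mirroring the tests that follow; the token, model id and local path are placeholders.

from modelscope.hub.api import HubApi
from modelscope.hub.constants import Licenses, ModelVisibility
from modelscope.hub.upload import upload_folder

api = HubApi()
api.login('<your-access-token>')           # login is required before uploading
upload_folder(
    model_id='my-org/my-finetuned-model',  # caller needs write permission
    model_dir='/path/to/finetune_output',  # must contain configuration.json
    visibility=ModelVisibility.PUBLIC,     # only needed when the repo does not exist yet
    license=Licenses.APACHE_V2,            # only needed when the repo does not exist yet
    revision='v1.0')                       # a new branch is created if missing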
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.repository import Repository +from modelscope.hub.upload import upload_folder +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level +from .test_utils import TEST_ACCESS_TOKEN1, delete_credential + +logger = get_logger() + + +class HubUploadTest(unittest.TestCase): + + def setUp(self): + logger.info('SetUp') + self.api = HubApi() + self.user = os.environ.get('TEST_MODEL_ORG', 'citest') + logger.info(self.user) + self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + temporary_dir = tempfile.mkdtemp() + self.work_dir = temporary_dir + self.model_dir = os.path.join(temporary_dir, self.create_model_name) + self.finetune_path = os.path.join(self.work_dir, 'finetune_path') + self.repo_path = os.path.join(self.work_dir, 'repo_path') + os.mkdir(self.finetune_path) + os.system("echo '{}'>%s" + % os.path.join(self.finetune_path, ModelFile.CONFIGURATION)) + + def tearDown(self): + logger.info('TearDown') + shutil.rmtree(self.model_dir, ignore_errors=True) + self.api.delete_model(model_id=self.create_model_name) + + def test_upload_exits_repo_master(self): + logger.info('basic test for upload!') + self.api.login(TEST_ACCESS_TOKEN1) + self.api.create_model( + model_id=self.create_model_name, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, model_dir=self.finetune_path) + Repository(model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '222'>%s" + % os.path.join(self.finetune_path, 'add2.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '333'>%s" + % os.path.join(self.finetune_path, 'add3.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version2', + commit_message='add add3.py') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version2') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + assert os.path.exists(os.path.join(self.repo_path, 'add3.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + add4_path = os.path.join(self.finetune_path, 'temp') + os.mkdir(add4_path) + os.system("echo '444'>%s" % os.path.join(add4_path, 'add4.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(add4_path, 'add4.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_non_exists_repo(self): + logger.info('test upload non exists repo!') + 
self.api.login(TEST_ACCESS_TOKEN1) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_model_new_revision', + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_model_new_revision') + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_without_token(self): + logger.info('test upload without login!') + self.api.login(TEST_ACCESS_TOKEN1) + delete_credential() + try: + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + self.api.login(TEST_ACCESS_TOKEN1) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_invalid_repo(self): + logger.info('test upload to invalid repo!') + self.api.login(TEST_ACCESS_TOKEN1) + try: + upload_folder( + model_id='%s/%s' % ('speech_tts', 'invalid_model_test'), + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + +if __name__ == '__main__': + unittest.main() From 7720ae50e241ed3a5cf319d9410b774228d8126c Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Mon, 17 Oct 2022 20:30:42 +0800 Subject: [PATCH 056/129] return dict values when input single sample for easycv pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10423383 --- .../pipelines/cv/easycv_pipelines/base.py | 18 +++++++++++++++++- .../cv/easycv_pipelines/detection_pipeline.py | 3 +++ .../face_2d_keypoints_pipeline.py | 3 +++ .../human_wholebody_keypoint_pipeline.py | 3 +++ tests/pipelines/test_face_2d_keypoints.py | 2 +- tests/pipelines/test_hand_2d_keypoints.py | 9 ++------- .../pipelines/test_human_wholebody_keypoint.py | 2 +- tests/pipelines/test_object_detection.py | 2 +- 8 files changed, 31 insertions(+), 11 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index 8aea1146..c130aea0 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -4,7 +4,9 @@ import os import os.path as osp from typing import Any +import numpy as np from easycv.utils.ms_utils import EasyCVMeta +from PIL import ImageFile from modelscope.hub.snapshot_download import snapshot_download from modelscope.pipelines.util import is_official_hub_path @@ -94,5 +96,19 @@ class EasyCVPipeline(object): return 
easycv_config + def _is_single_inputs(self, inputs): + if isinstance(inputs, str) or (isinstance(inputs, list) + and len(inputs) == 1) or isinstance( + inputs, np.ndarray) or isinstance( + inputs, ImageFile.ImageFile): + return True + + return False + def __call__(self, inputs) -> Any: - return self.predict_op(inputs) + outputs = self.predict_op(inputs) + + if self._is_single_inputs(inputs): + outputs = outputs[0] + + return outputs diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 0c2058d5..a1173bc4 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -57,4 +57,7 @@ class EasyCVDetectionPipeline(EasyCVPipeline): OutputKeys.BOXES: boxes } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 7c32e0fc..b48d013e 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -40,4 +40,7 @@ class Face2DKeypointsPipeline(EasyCVPipeline): OutputKeys.POSES: output['pose'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py index 263f8225..936accbf 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -62,4 +62,7 @@ class HumanWholebodyKeypointsPipeline(EasyCVPipeline): OutputKeys.BOXES: output['boxes'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index 667ecddc..a5e347e8 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): face_2d_keypoints_align = pipeline( task=Tasks.face_2d_keypoints, model=model_id) - output = face_2d_keypoints_align(img_path)[0] + output = face_2d_keypoints_align(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.POSES] diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py index 86cd2d06..43b569d0 100644 --- a/tests/pipelines/test_hand_2d_keypoints.py +++ b/tests/pipelines/test_hand_2d_keypoints.py @@ -15,10 +15,8 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id) - outputs = hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) + results = hand_keypoint(img_path) - results = outputs[0] self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) @@ -30,10 +28,7 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): img_path = 'data/test/images/hand_keypoints.jpg' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints) - outputs = 
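A short sketch of the new single-sample behaviour for EasyCV pipelines: a single image (or a one-element list) now returns the result dict directly, while longer lists still return a list of dicts. The model id and image path come from the hand-keypoints test in this commit.

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

hand_keypoint = pipeline(
    Tasks.hand_2d_keypoints,
    model='damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody')
results = hand_keypoint('data/test/images/hand_keypoints.jpg')
print(results[OutputKeys.KEYPOINTS].shape)  # dict access, no [0] indexing needed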
hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) - - results = outputs[0] + results = hand_keypoint(img_path) self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py index b214f4e1..7c5946cc 100644 --- a/tests/pipelines/test_human_wholebody_keypoint.py +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): human_wholebody_keypoint_pipeline = pipeline( task=Tasks.human_wholebody_keypoint, model=model_id) - output = human_wholebody_keypoint_pipeline(img_path)[0] + output = human_wholebody_keypoint_pipeline(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.BOXES] diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 00a71371..64766c77 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -55,7 +55,7 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): image_object_detection_auto = pipeline( Tasks.image_object_detection, model=model_id) - result = image_object_detection_auto(test_image)[0] + result = image_object_detection_auto(test_image) image_object_detection_auto.show_result(test_image, result, 'auto_demo_ret.jpg') From ac07b719e9b83c5da6c108e75e0767211f343016 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 17 Oct 2022 20:51:58 +0800 Subject: [PATCH 057/129] [to #45546922]feat: add fasttext package Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10431169 * [to #45546922]feat: add fasttext package --- docker/Dockerfile.ubuntu | 2 +- modelscope/hub/errors.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index a9a409b5..6dafbc3e 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index fb483287..bd7a20ac 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -53,8 +53,8 @@ def handle_http_post_error(response, url, request_body): try: response.raise_for_status() except HTTPError as error: - logger.error('Request %s with body: %s exception, respoonse body: %s' % - (url, request_body, response.body)) + logger.error('Request %s with body: %s exception' % + (url, request_body)) raise error From 271e2a2a9916de3bd64e40dd4c836d341fed4b77 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Mon, 17 Oct 2022 20:54:29 +0800 Subject: [PATCH 058/129] [to #42322933] Add gpt_neo model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 添加 gpt_neo 模型,因 checkpoint 归属于 Langboat 还未上传到模型库,已线下完成测试 2. 
添加 text-generation task models 与 head,后续会将 gpt3,palm 等已上线文本生成模型统一为 backbone + head 结构的 task models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10404249 --- modelscope/metainfo.py | 5 ++ modelscope/models/nlp/__init__.py | 4 +- modelscope/models/nlp/backbones/gpt_neo.py | 15 ++++ .../models/nlp/heads/text_generation_head.py | 35 ++++++++ modelscope/models/nlp/task_models/__init__.py | 2 + .../models/nlp/task_models/text_generation.py | 79 +++++++++++++++++++ .../pipelines/nlp/text_generation_pipeline.py | 38 ++++++--- modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 21 +++++ tests/pipelines/test_text_generation.py | 13 +++ tests/utils/test_ast.py | 2 +- 12 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 modelscope/models/nlp/backbones/gpt_neo.py create mode 100644 modelscope/models/nlp/heads/text_generation_head.py create mode 100644 modelscope/models/nlp/task_models/text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2e3fed98..fb99bc71 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -71,6 +71,7 @@ class Models(object): gcnncrf = 'gcnn-crf' bart = 'bart' gpt3 = 'gpt3' + gpt_neo = 'gpt-neo' plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' @@ -101,6 +102,7 @@ class TaskModels(object): information_extraction = 'information-extraction' fill_mask = 'fill-mask' feature_extraction = 'feature-extraction' + text_generation = 'text-generation' class Heads(object): @@ -116,6 +118,8 @@ class Heads(object): token_classification = 'token-classification' # extraction information_extraction = 'information-extraction' + # text gen + text_generation = 'text-generation' class Pipelines(object): @@ -341,6 +345,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + sentence_piece = 'sentence-piece' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 8ef96365..9e830d17 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -30,7 +30,8 @@ if TYPE_CHECKING: InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, - TokenClassificationModel) + TokenClassificationModel, + TaskModelForTextGeneration) from .token_classification import SbertForTokenClassification from .sentence_embedding import SentenceEmbedding from .passage_ranking import PassageRanking @@ -69,6 +70,7 @@ else: 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', 'TokenClassificationModel', + 'TaskModelForTextGeneration', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/backbones/gpt_neo.py new file mode 100644 index 00000000..a2d0c374 --- /dev/null +++ b/modelscope/models/nlp/backbones/gpt_neo.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from transformers import GPTNeoConfig +from transformers import GPTNeoModel as GPTNeoModelTransform + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields + + +@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +class GPTNeoModel(GPTNeoModelTransform): + + def __init__(self, **kwargs): + config = GPTNeoConfig(**kwargs) + super().__init__(config) diff --git a/modelscope/models/nlp/heads/text_generation_head.py b/modelscope/models/nlp/heads/text_generation_head.py new file mode 100644 index 00000000..606d5a1f --- /dev/null +++ b/modelscope/models/nlp/heads/text_generation_head.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module( + Tasks.text_generation, module_name=Heads.text_generation) +class TextGenerationHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + config = self.config + self.linear = nn.Linear( + config['hidden_size'], config['vocab_size'], bias=False) + + def get_output_embeddings(self): + return self.linear + + def forward(self, inputs=None): + logits = self.linear(inputs) + return {OutputKeys.LOGITS: logits} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + logits = outputs[OutputKeys.LOGITS] + return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 90f22aa1..38359044 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel + from .text_generation import TaskModelForTextGeneration else: _import_structure = { @@ -19,6 +20,7 @@ else: 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], + 'text_generation': ['TaskModelForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py new file mode 100644 index 00000000..973198ae --- /dev/null +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import Any, Dict + +import addict +import numpy as np +from transformers.modeling_utils import PreTrainedModel + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['TaskModelForTextGeneration'] + + +@MODELS.register_module( + Tasks.text_generation, module_name=TaskModels.text_generation) +class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + if self.config.get('shared_embedding', False): + input_embeddings = self.backbone.get_input_embeddings() + output_embeddings = self.head.get_output_embeddings() + output_embeddings.weight = input_embeddings.weight + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + backbone_outputs = super().forward(input) + hidden_states = backbone_outputs[0] + + outputs = self.head.forward(hidden_states) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + return addict.Dict(outputs) + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get('token_type_ids', None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get('attention_mask', None) + position_ids = kwargs.get('position_ids', None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + 'input_ids': input_ids, + 'past_key_values': past, + 'use_cache': kwargs.get('use_cache'), + 'position_ids': position_ids, + 'attention_mask': attention_mask, + 'token_type_ids': token_type_ids, + } diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index ea35763f..ae92f26a 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -6,10 +6,12 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.base import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextGenerationPreprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.hub import read_config __all__ = ['TextGenerationPipeline'] @@ -20,7 +22,7 @@ class 
TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: Optional[TextGenerationPreprocessor] = None, + preprocessor: Optional[Preprocessor] = None, first_sequence='sentence', **kwargs): """Use `model` and `preprocessor` to create a generation pipeline for prediction. @@ -50,19 +52,34 @@ class TextGenerationPipeline(Pipeline): """ model = model if isinstance(model, Model) else Model.from_pretrained(model) + cfg = read_config(model.model_dir) + self.postprocessor = cfg.pop('postprocessor', None) if preprocessor is None: - preprocessor = TextGenerationPreprocessor( + preprocessor_cfg = cfg.preprocessor + preprocessor_cfg.update({ + 'model_dir': model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 128)) + 'first_sequence': + first_sequence, + 'second_sequence': + None, + 'sequence_length': + kwargs.pop('sequence_length', 128) + }) + preprocessor = build_preprocessor(preprocessor_cfg, Fields.nlp) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model.generate(inputs) + return self.model.generate(inputs, **forward_params) + + def sentence_piece(self, inputs) -> Dict[str, Tensor]: + return self.preprocessor.tokenizer.decode(inputs.tolist())[0] def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]: @@ -74,4 +91,7 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - return inputs + return inputs if self.postprocessor is None else { + OutputKeys.TEXT: + getattr(self, self.postprocessor.replace('-', '_'))(inputs) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 90303b65..43fa64a7 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -71,6 +72,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index dfbb5c81..a753fe6c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) else: @@ -41,6 +42,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index bec7e4e1..3d708634 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -5,6 +5,7 @@ import re from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np +import sentencepiece as spm 
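With the pipeline change above, the text-generation preprocessor and postprocessor are read from the model's configuration.json instead of being hard-coded. For a SentencePiece-based checkpoint such as the gpt-neo model added here, the relevant fields would roughly look like the sketch below; this is an assumption for illustration, not the exact schema.

# configuration.json fragment the revised TextGenerationPipeline consumes
config_fragment = {
    'preprocessor': {'type': 'sentence-piece'},  # built via build_preprocessor(..., Fields.nlp)
    'postprocessor': 'sentence-piece',           # dispatched to the pipeline's sentence_piece() hook
}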
import torch from transformers import AutoTokenizer @@ -1160,3 +1161,23 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): self.labels_to_id(labels, output) return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 66f9c9da..5a270f83 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -133,6 +133,19 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") + def test_gpt_neo(self): + pipe = pipeline( + task=Tasks.text_generation, model='Langboat/mengzi-gpt-neo-base') + print( + pipe( + '我是', + do_sample=True, + top_k=5, + top_p=1, + max_length=20, + repetition_penalty=0.5)) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index 9a8ab828..c0624679 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -41,7 +41,7 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) - self.assertEqual(len(from_imports.keys()), 7) + self.assertEqual(len(from_imports.keys()), 9) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) self.assertEqual(decorators, From 172522d19654a9e6c3d872170753086cf2452411 Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 17 Oct 2022 20:58:23 +0800 Subject: [PATCH 059/129] [to #42322933]video-object-detection init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增video-object-detection 算法 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247489 --- data/test/videos/test_realtime_vod.mp4 | 3 + modelscope/metainfo.py | 2 + .../cv/realtime_object_detection/__init__.py | 2 + .../realtime_video_detector.py | 117 +++++++ .../yolox/exp/build.py | 2 + .../yolox/exp/default/__init__.py | 2 +- .../yolox/exp/default/streamyolo.py | 43 +++ .../yolox/exp/yolox_base.py | 1 - .../yolox/models/__init__.py | 3 + .../yolox/models/dfp_pafpn.py | 307 ++++++++++++++++++ .../yolox/models/network_blocks.py | 1 - .../yolox/models/streamyolo.py | 41 +++ .../yolox/models/tal_head.py | 170 ++++++++++ modelscope/outputs.py | 31 +- ...ealtime_video_object_detection_pipeline.py | 59 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 60 ++++ .../test_realtime_video_object_detection.py | 46 +++ 18 files changed, 886 insertions(+), 5 deletions(-) create mode 100644 data/test/videos/test_realtime_vod.mp4 create mode 100644 modelscope/models/cv/realtime_object_detection/realtime_video_detector.py create mode 100644 
modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py create mode 100644 modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py create mode 100644 tests/pipelines/test_realtime_video_object_detection.py diff --git a/data/test/videos/test_realtime_vod.mp4 b/data/test/videos/test_realtime_vod.mp4 new file mode 100644 index 00000000..a0e44852 --- /dev/null +++ b/data/test/videos/test_realtime_vod.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87 +size 4068783 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index fb99bc71..e4a26303 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -14,6 +14,7 @@ class Models(object): # vision models detection = 'detection' realtime_object_detection = 'realtime-object-detection' + realtime_video_object_detection = 'realtime-video-object-detection' scrfd = 'scrfd' classification_model = 'ClassificationModel' nafnet = 'nafnet' @@ -170,6 +171,7 @@ class Pipelines(object): face_image_generation = 'gan-face-image-generation' product_retrieval_embedding = 'resnet50-product-retrieval-embedding' realtime_object_detection = 'cspnet_realtime-object-detection_yolox' + realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' face_recognition = 'ir101-face-recognition-cfglint' image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' image2image_translation = 'image-to-image-translation' diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py index aed13cec..66156977 100644 --- a/modelscope/models/cv/realtime_object_detection/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .realtime_detector import RealtimeDetector + from .realtime_video_detector import RealtimeVideoDetector else: _import_structure = { 'realtime_detector': ['RealtimeDetector'], + 'realtime_video_detector': ['RealtimeVideoDetector'], } import sys diff --git a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py new file mode 100644 index 00000000..fc7339b3 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
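A minimal sketch of driving the new StreamYOLO detector through the pipeline interface with the test video added in this commit; the model id is a placeholder, and the exact result layout is defined by the new pipeline and output keys above.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

video_object_detect = pipeline(
    Tasks.video_object_detection, model='<streamyolo-model-id>')
result = video_object_detect('data/test/videos/test_realtime_vod.mp4')
print(result)  # per-frame boxes, scores and label names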
+import argparse +import logging as logger +import os +import os.path as osp +import time + +import cv2 +import json +import torch +from tqdm import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .yolox.data.data_augment import ValTransform +from .yolox.exp import get_exp_by_name +from .yolox.utils import postprocess + + +@MODELS.register_module( + group_key=Tasks.video_object_detection, + module_name=Models.realtime_video_object_detection) +class RealtimeVideoDetector(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + # model type + self.exp = get_exp_by_name(self.config.model_type) + + # build model + self.model = self.exp.get_model() + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + ckpt = torch.load(model_path, map_location='cpu') + + # load the model state dict + self.model.load_state_dict(ckpt['model']) + self.model.eval() + + # params setting + self.exp.num_classes = self.config.num_classes + self.confthre = self.config.conf_thr + self.num_classes = self.exp.num_classes + self.nmsthre = self.exp.nmsthre + self.test_size = self.exp.test_size + self.preproc = ValTransform(legacy=False) + self.current_buffer = None + self.label_mapping = self.config['labels'] + + def inference(self, img): + with torch.no_grad(): + outputs, self.current_buffer = self.model( + img, buffer=self.current_buffer, mode='on_pipe') + return outputs + + def forward(self, inputs): + return self.inference_video(inputs) + + def preprocess(self, img): + img = LoadImage.convert_to_ndarray(img) + height, width = img.shape[:2] + self.ratio = min(self.test_size[0] / img.shape[0], + self.test_size[1] / img.shape[1]) + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + + # Video decoding and preprocessing automatically are not supported by Pipeline/Model + # Sending preprocessed video frame tensor to GPU buffer self-adaptively + if next(self.model.parameters()).is_cuda: + img = img.to(next(self.model.parameters()).device) + return img + + def postprocess(self, input): + outputs = postprocess( + input, + self.num_classes, + self.confthre, + self.nmsthre, + class_agnostic=True) + + if len(outputs) == 1: + bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio + scores = outputs[0][:, 5].cpu().numpy() + labels = outputs[0][:, 6].cpu().int().numpy() + pred_label_names = [] + for lab in labels: + pred_label_names.append(self.label_mapping[lab]) + + return bboxes, scores, pred_label_names + + def inference_video(self, v_path): + outputs = [] + desc = 'Detecting video: {}'.format(v_path) + for frame, result in tqdm( + self.inference_video_iter(v_path), desc=desc): + outputs.append(result) + + return outputs + + def inference_video_iter(self, v_path): + capture = cv2.VideoCapture(v_path) + while capture.isOpened(): + ret, frame = capture.read() + if not ret: + break + output = self.preprocess(frame) + output = self.inference(output) + output = self.postprocess(output) + yield frame, output diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py 
b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py index 4858100c..5865c53b 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): from .default import YoloXNanoExp as YoloXExp elif exp == 'yolox_tiny': from .default import YoloXTinyExp as YoloXExp + elif exp == 'streamyolo': + from .default import StreamYoloExp as YoloXExp else: pass return YoloXExp() diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py index 552bbccd..cfec836c 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py @@ -1,5 +1,5 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - +from .streamyolo import StreamYoloExp from .yolox_nano import YoloXNanoExp from .yolox_s import YoloXSExp from .yolox_tiny import YoloXTinyExp diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py new file mode 100644 index 00000000..5a62c8fc --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py @@ -0,0 +1,43 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import os +import sys + +import torch + +from ..yolox_base import Exp as YoloXExp + + +class StreamYoloExp(YoloXExp): + + def __init__(self): + super(YoloXExp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.num_classes = 8 + self.test_size = (600, 960) + self.test_conf = 0.3 + self.nmsthre = 0.65 + + def get_model(self): + from ...models import StreamYOLO, DFPPAFPN, TALHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, 'model', None) is None: + in_channels = [256, 512, 1024] + backbone = DFPPAFPN( + self.depth, self.width, in_channels=in_channels) + head = TALHead( + self.num_classes, + self.width, + in_channels=in_channels, + gamma=1.0, + ignore_thr=0.5, + ignore_value=1.6) + self.model = StreamYOLO(backbone, head) + + return self.model diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py index a2a41535..c5159a9f 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import os import random diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py index 20b1a0d1..d2e889f1 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py @@ -1,6 +1,9 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX from .darknet import CSPDarknet, Darknet +from .dfp_pafpn import DFPPAFPN +from .streamyolo import StreamYOLO +from .tal_head import TALHead from .yolo_fpn import YOLOFPN 
from .yolo_head import YOLOXHead from .yolo_pafpn import YOLOPAFPN diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py new file mode 100644 index 00000000..01284791 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py @@ -0,0 +1,307 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class DFPPAFPN(nn.Module): + """ + YOLOv3 model. Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=('dark3', 'dark4', 'dark5'), + in_channels=[256, 512, 1024], + depthwise=False, + act='silu', + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), + int(in_channels[1] * width), + 1, + 1, + act=act) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), + int(in_channels[0] * width), + 1, + 1, + act=act) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + self.jian2 = Conv( + in_channels=int(in_channels[0] * width), + out_channels=int(in_channels[0] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian1 = Conv( + in_channels=int(in_channels[1] * width), + out_channels=int(in_channels[1] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian0 = Conv( + in_channels=int(in_channels[2] * width), + out_channels=int(in_channels[2] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + def off_forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(torch.split(input, 3, dim=1)[0]) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + + support_out_features = self.backbone(torch.split(input, 3, dim=1)[1]) + support_features = [support_out_features[f] for f in self.in_features] + [support_x2, support_x1, support_x0] = support_features + + support_fpn_out0 = self.lateral_conv0(support_x0) # 1024->512/32 + support_f_out0 = F.interpolate( + support_fpn_out0, size=support_x1.shape[2:4], + mode='nearest') # 512/16 + support_f_out0 = torch.cat([support_f_out0, support_x1], + 1) # 512->1024/16 + support_f_out0 = self.C3_p4(support_f_out0) # 1024->512/16 + + support_fpn_out1 = self.reduce_conv1(support_f_out0) # 512->256/16 + support_f_out1 = F.interpolate( + support_fpn_out1, size=support_x2.shape[2:4], + mode='nearest') # 256/8 + support_f_out1 = torch.cat([support_f_out1, support_x2], + 1) # 256->512/8 + support_pan_out2 = self.C3_p3(support_f_out1) # 512->256/8 + + support_p_out1 = self.bu_conv2(support_pan_out2) # 256->256/16 + support_p_out1 = torch.cat([support_p_out1, support_fpn_out1], + 1) # 256->512/16 + support_pan_out1 = self.C3_n3(support_p_out1) # 512->512/16 + + support_p_out0 = self.bu_conv1(support_pan_out1) # 512->512/32 + support_p_out0 = torch.cat([support_p_out0, support_fpn_out0], + 1) # 512->1024/32 + support_pan_out0 = self.C3_n4(support_p_out0) # 1024->1024/32 + + # 0.5 channel + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + return outputs + + def online_forward(self, input, buffer=None, node='star'): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(input) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + if node == 'star': + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(rurrent_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(rurrent_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(rurrent_pan_out0)], + dim=1) + rurrent_pan_out0 + elif node == 'buffer': + + [support_pan_out2, support_pan_out1, support_pan_out0] = buffer + + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + buffer_ = (rurrent_pan_out2, rurrent_pan_out1, rurrent_pan_out0) + + return outputs, buffer_ + + def forward(self, input, buffer=None, mode='off_pipe'): + + if mode == 'off_pipe': + # Glops caculate mode + if input.size()[1] == 3: + input = torch.cat([input, input], dim=1) + output = self.off_forward(input) + # offline train mode + elif input.size()[1] == 6: + output = self.off_forward(input) + + return output + + elif mode == 'on_pipe': + # online star state + if buffer is None: + output, buffer_ = self.online_forward(input, node='star') + # online inference + else: + assert len(buffer) == 3 + assert input.size()[1] == 3 + output, buffer_ = self.online_forward( + input, buffer=buffer, node='buffer') + + return output, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py index fd15c1c1..88bd55c7 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import torch import torch.nn as nn diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py 
b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py new file mode 100644 index 00000000..b3ec3504 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py @@ -0,0 +1,41 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch.nn as nn + +from .dfp_pafpn import DFPPAFPN +from .tal_head import TALHead + + +class StreamYOLO(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. + """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = DFPPAFPN() + if head is None: + head = TALHead(20) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None, buffer=None, mode='off_pipe'): + # fpn output content features of [dark3, dark4, dark5] + assert mode in ['off_pipe', 'on_pipe'] + + if mode == 'off_pipe': + fpn_outs = self.backbone(x, buffer=buffer, mode='off_pipe') + if self.training: + pass + else: + outputs = self.head(fpn_outs, imgs=x) + + return outputs + elif mode == 'on_pipe': + fpn_outs, buffer_ = self.backbone(x, buffer=buffer, mode='on_pipe') + outputs = self.head(fpn_outs) + + return outputs, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py new file mode 100644 index 00000000..7a82f8c6 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py @@ -0,0 +1,170 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .network_blocks import BaseConv, DWConv + + +class TALHead(nn.Module): + + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act='silu', + depthwise=False, + gamma=1.5, + ignore_thr=0.2, + ignore_value=0.2, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): wheather apply depthwise conv in conv branch. Defalut value: False. 
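+            gamma / ignore_thr / ignore_value (float): loss-related
+                hyper-parameters carried over from the original StreamYOLO
+                TAL head; they are unused here because only the inference
+                branch is implemented.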
+ """ + super().__init__() + + self.gamma = gamma + self.ignore_thr = ignore_thr + self.ignore_value = ignore_value + + self.n_anchors = 1 + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + )) + self.cls_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.reg_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + )) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + )) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + )) + + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + self.expanded_strides = [None] * len(in_channels) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin)): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + pass + + else: + output = torch.cat( + [reg_output, + obj_output.sigmoid(), + cls_output.sigmoid()], 1) + + outputs.append(output) + + if self.training: + pass + else: + self.hw = [x.shape[-2:] for x in outputs] + outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], + dim=2).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + return outputs diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c08779b4..a49ddacf 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -165,6 +165,32 @@ TASK_OUTPUTS = { Tasks.image_object_detection: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # video object 
detection result for single sample + # { + + # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]] + # "labels": [["person", "traffic light", "car", "bus"], + # ["person", "traffic light", "car", "bus"]] + # "boxes": + # [ + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ], + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # ], + + # } + Tasks.video_object_detection: + [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # instance segmentation result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05], @@ -676,8 +702,9 @@ TASK_OUTPUTS = { # "text_embedding": np.array with shape [1, D], # "similarity": float # } - Tasks.multi_modal_similarity: - [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], + Tasks.multi_modal_similarity: [ + OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES + ], # VQA result for a sample # {"text": "this is a text answser. "} diff --git a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py new file mode 100644 index 00000000..3686c50a --- /dev/null +++ b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict, List, Union + +import cv2 +import json +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.realtime_object_detection import \ + RealtimeVideoDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Model, Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_object_detection, + module_name=Pipelines.realtime_video_object_detection) +class RealtimeVideoObjectDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + super().__init__(model=model, **kwargs) + self.model = RealtimeVideoDetector(model) + + def preprocess(self, input: Input) -> Dict[Tensor, Union[str, np.ndarray]]: + return input + + def forward(self, input: Input) -> Dict[Tensor, Dict[str, np.ndarray]]: + self.video_path = input + # Processing the whole video and return results for each frame + forward_output = self.model.inference_video(self.video_path) + return {'forward_output': forward_output} + + def postprocess(self, input: Dict[Tensor, Dict[str, np.ndarray]], + **kwargs) -> str: + forward_output = input['forward_output'] + + scores, boxes, labels = [], [], [] + for result in forward_output: + box, score, label = result + scores.append(score) + boxes.append(box) + labels.append(label) + + return { + OutputKeys.BOXES: boxes, + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + } diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 9e10e802..0eb369da 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_classification_dailylife = 'image-classification-dailylife' image_object_detection = 'image-object-detection' + video_object_detection = 'video-object-detection' 
image_segmentation = 'image-segmentation' semantic_segmentation = 'semantic-segmentation' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 2d420892..34dc2348 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -231,6 +231,66 @@ def show_video_tracking_result(video_in_path, bboxes, video_save_path): cap.release() +def show_video_object_detection_result(video_in_path, bboxes_list, labels_list, + video_save_path): + + PALETTE = { + 'person': [128, 0, 0], + 'bicycle': [128, 128, 0], + 'car': [64, 0, 0], + 'motorcycle': [0, 128, 128], + 'bus': [64, 128, 0], + 'truck': [192, 128, 0], + 'traffic light': [64, 0, 128], + 'stop sign': [192, 0, 128], + } + from tqdm import tqdm + import math + cap = cv2.VideoCapture(video_in_path) + with tqdm(total=len(bboxes_list)) as pbar: + pbar.set_description( + 'Writing results to video: {}'.format(video_save_path)) + for i in range(len(bboxes_list)): + bboxes = bboxes_list[i].astype(int) + labels = labels_list[i] + success, frame = cap.read() + if success is False: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + if i == 0: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + + FONT_SCALE = 1e-3 # Adjust for larger font size in all images + THICKNESS_SCALE = 1e-3 # Adjust for larger thickness in all images + TEXT_Y_OFFSET_SCALE = 1e-2 # Adjust for larger Y-offset of text and bounding box + H, W, _ = frame.shape + zeros_mask = np.zeros((frame.shape)).astype(np.uint8) + for bbox, l in zip(bboxes, labels): + cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + PALETTE[l], 1) + cv2.putText( + frame, + l, (bbox[0], bbox[1] - int(TEXT_Y_OFFSET_SCALE * H)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=min(H, W) * FONT_SCALE, + thickness=math.ceil(min(H, W) * THICKNESS_SCALE), + color=PALETTE[l]) + zeros_mask = cv2.rectangle( + zeros_mask, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + color=PALETTE[l], + thickness=-1) + + frame = cv2.addWeighted(frame, 1., zeros_mask, .65, 0) + video_writer.write(frame) + pbar.update(1) + video_writer.release + cap.release() + + def panoptic_seg_masks_to_image(masks): draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3]) from mmdet.core.visualization.palette import get_palette diff --git a/tests/pipelines/test_realtime_video_object_detection.py b/tests/pipelines/test_realtime_video_object_detection.py new file mode 100644 index 00000000..d65313a3 --- /dev/null +++ b/tests/pipelines/test_realtime_video_object_detection.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
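+# End-to-end test of the StreamYOLO realtime video object detection pipeline:
+# the model is fetched from the ModelScope hub, run on a short test video,
+# and the predicted boxes are rendered into an output video file.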
+import unittest + +import cv2 +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_object_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class RealtimeVideoObjectDetectionTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/cv_cspnet_video-object-detection_streamyolo' + self.test_video = 'data/test/videos/test_realtime_vod.mp4' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + realtime_video_object_detection = pipeline( + Tasks.video_object_detection, model=self.model_id) + result = realtime_video_object_detection(self.test_video) + if result: + logger.info('Video output to test_vod_results.avi') + show_video_object_detection_result(self.test_video, + result[OutputKeys.BOXES], + result[OutputKeys.LABELS], + 'test_vod_results.avi') + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From e3eb01f4cee8c39a66c797cfdf8e29917011424a Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 17 Oct 2022 23:31:44 +0800 Subject: [PATCH 060/129] [to #42322933]update word-segmentation regression results Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10432186 --- data/test/regression/sbert_ws_en.bin | 4 ++-- data/test/regression/sbert_ws_zh.bin | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin index 4eb562d6..6e441f7f 100644 --- a/data/test/regression/sbert_ws_en.bin +++ b/data/test/regression/sbert_ws_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 -size 60801 +oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c +size 61239 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index 555f640d..b1841351 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c -size 60801 +oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 +size 61115 From 2eb835aca489f5b7dcdfb6199d34f1bfc85f6d7c Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 18 Oct 2022 11:12:12 +0800 Subject: [PATCH 061/129] [to #42322933]Add uuid to model which created by ut test Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 * [Update] update finetune_result_upload * [Update] rename finetune_result_upload to model_dir_upload * Merge branch 'master' into feat/upload_ckpt * Merge branch 'master' into feat/upload_ckpt * [Fix] fix import error * [Fix] fix import error * Merge branch 'master' into feat/upload_ckpt * [Update] changes name to upload_folder and using tempfile to save repo * Merge branch 'master' into feat/upload_ckpt * 
[Fix] fix commit * Merge branch 'master' into feat/upload_ckpt * [Fix] fix format * Merge branch 'master' into feat/upload_ckpt * [Fix] add uuid after model created from upload ut --- tests/hub/test_hub_upload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py index d7e6e439..2250164b 100644 --- a/tests/hub/test_hub_upload.py +++ b/tests/hub/test_hub_upload.py @@ -3,6 +3,7 @@ import os import shutil import tempfile import unittest +import uuid from modelscope.hub.api import HubApi from modelscope.hub.constants import Licenses, ModelVisibility @@ -23,7 +24,9 @@ class HubUploadTest(unittest.TestCase): self.api = HubApi() self.user = os.environ.get('TEST_MODEL_ORG', 'citest') logger.info(self.user) - self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + self.create_model_name = '%s/%s_%s' % (self.user, 'test_model_upload', + uuid.uuid4().hex) + logger.info('create %s' % self.create_model_name) temporary_dir = tempfile.mkdtemp() self.work_dir = temporary_dir self.model_dir = os.path.join(temporary_dir, self.create_model_name) From c0b546a96eaaaaef2e9ab1bf32b1abe9092d33e1 Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 18 Oct 2022 14:34:26 +0800 Subject: [PATCH 062/129] [to #42322933]add subset_name when loading dataset (NAFNet image denoising) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10427797 --- tests/trainers/test_image_denoise_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 0bcb8930..68ddf616 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -33,11 +33,13 @@ class ImageDenoiseTrainerTest(unittest.TestCase): dataset_train = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='validation', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds dataset_val = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='test', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds self.dataset_train = SiddImageDenoisingDataset( From 3b1f1a0252d4fee7ecd15ac8dc7c04ec0535add0 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 18 Oct 2022 15:58:33 +0800 Subject: [PATCH 063/129] [to #42322933] Add GPT3 tensor parallel inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加基于 Megatron-v3 的 GPT3 tensor 并行的推理代码 复用 DistributedPipeline 与 megatron-util 适用模型:1.3B/2.7B/13B 参数的 GPT-3 预训练生成模型 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10416721 --- modelscope/metainfo.py | 2 + modelscope/models/nlp/gpt3/__init__.py | 2 + .../models/nlp/gpt3/configuration_gpt3.py | 90 +- .../models/nlp/gpt3/distributed_gpt3.py | 1057 +++++++++++++++++ modelscope/models/nlp/gpt3/modeling_gpt3.py | 54 +- modelscope/models/nlp/gpt3/tokenizer_gpt3.py | 69 ++ .../nlp/distributed_gpt3_pipeline.py | 54 + modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 35 + modelscope/utils/nlp/distributed.py | 5 +- tests/pipelines/test_gpt3_text_generation.py | 58 + 12 files changed, 1388 insertions(+), 42 deletions(-) create mode 100644 modelscope/models/nlp/gpt3/distributed_gpt3.py create mode 100644 modelscope/models/nlp/gpt3/tokenizer_gpt3.py create mode 100644 modelscope/pipelines/nlp/distributed_gpt3_pipeline.py create mode 100644 
tests/pipelines/test_gpt3_text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index e4a26303..2dbff948 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -227,6 +227,7 @@ class Pipelines(object): zero_shot_classification = 'zero-shot-classification' text_error_correction = 'text-error-correction' plug_generation = 'plug-generation' + gpt3_generation = 'gpt3-generation' faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' table_question_answering_pipeline = 'table-question-answering-pipeline' @@ -324,6 +325,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer' text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py index 076a0c6b..9cae8cc8 100644 --- a/modelscope/models/nlp/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -7,11 +7,13 @@ if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model from .gpt3_for_text_generation import GPT3ForTextGeneration + from .tokenizer_gpt3 import JiebaBPETokenizer else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], 'modeling_gpt3': ['GPT3Model'], 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'tokenizer_gpt3': ['JiebaBPETokenizer'], } import sys diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py index d5a054fd..66e8b836 100644 --- a/modelscope/models/nlp/gpt3/configuration_gpt3.py +++ b/modelscope/models/nlp/gpt3/configuration_gpt3.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import torch from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -21,25 +22,48 @@ logger = logging.get_logger(__name__) class GPT3Config(PretrainedConfig): - model_type = 'gpt' - - def __init__(self, - vocab_size=25600, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act='gelu', - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=2048, - type_vocab_size=2, - layernorm_epsilon=1e-12, - **kwargs): + model_type = 'gpt3' + + def __init__( + self, + vocab_size=25600, + hidden_size=768, + ffn_hidden_size=None, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=2048, + type_vocab_size=2, + layernorm_epsilon=1e-12, + bias_gelu_fusion=True, + fp32_residual_connection=False, + sequence_parallel=False, + fp16=False, + bf16=False, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=False, + kv_channels=None, + masked_softmax_fusion=True, + attention_dropout=0.1, + bias_dropout_fusion=True, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.1, + init_method_std=0.02, + # generate + eod_id=7, + tokens_to_generate=100, + top_k=0, + top_p=0.9, + **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size + self.ffn_hidden_size = 4 * hidden_size \ + if ffn_hidden_size is None else ffn_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act @@ -49,3 +73,39 @@ class GPT3Config(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.layernorm_epsilon = layernorm_epsilon + self.bias_gelu_fusion = bias_gelu_fusion + self.fp32_residual_connection = fp32_residual_connection + self.sequence_parallel = sequence_parallel + self.fp16 = fp16 + self.bf16 = bf16 + assert not (fp16 and bf16) + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + if kv_channels is None: + assert hidden_size % num_attention_heads == 0 + self.kv_channels = hidden_size // num_attention_heads + self.masked_softmax_fusion = masked_softmax_fusion + self.attention_dropout = attention_dropout + self.bias_dropout_fusion = bias_dropout_fusion + self.apply_residual_connection_post_layernorm = \ + apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.init_method_std = init_method_std + self.eod_id = eod_id + self.tokens_to_generate = tokens_to_generate + self.top_k = top_k + self.top_p = top_p + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + self.no_persist_layer_norm = \ + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11) + + @property + def params_dtype(self): + if self.fp16: + return torch.half + elif self.bf16: + return torch.bfloat16 + else: + return torch.float diff --git a/modelscope/models/nlp/gpt3/distributed_gpt3.py b/modelscope/models/nlp/gpt3/distributed_gpt3.py new file mode 100644 index 00000000..a0091259 --- /dev/null +++ b/modelscope/models/nlp/gpt3/distributed_gpt3.py @@ -0,0 +1,1057 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from megatron import mpu +from megatron.global_vars import get_global_memory_buffer, set_global_variables +from megatron.model import (AttnMaskType, Float16Module, LayerNorm, + bias_gelu_impl) +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from torch import nn +from torch.nn import functional as F +from transformers.modeling_utils import PreTrainedModel + +from modelscope.models import TorchModel +from modelscope.models.nlp.gpt3 import GPT3Config +from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.nlp.load_checkpoint import pre_load +from modelscope.utils.torch_utils import set_random_seed_mpu + + +class GPT3ParallelMLP(nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + # Project to 4h. + self.dense_h_to_4h = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + config.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True) + + self.bias_gelu_fusion = config.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. + self.dense_4h_to_h = mpu.RowParallelLinearV3( + config, + config.ffn_hidden_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h( + hidden_states) + + if self.bias_gelu_fusion: + intermediate_parallel = \ + bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = \ + self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + + +class GPT3Embedding(nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, self.hidden_size, init_method=self.init_method) + + # Position embedding (serial). + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + self.hidden_size) + # Initialize the position embeddings. 
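+        # Unlike the word embeddings above, which are sharded over the
+        # vocabulary dimension across tensor-parallel ranks, the position
+        # embeddings are replicated on every rank.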
+ self.init_method(self.position_embeddings.weight) + + self.fp32_residual_connection = config.fp32_residual_connection + self.sequence_parallel = config.sequence_parallel + # Embeddings dropout + self.embedding_dropout = nn.Dropout(config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + + def forward(self, input_ids, position_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.sequence_parallel: + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + with mpu.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + return embeddings + + +class NoopTransformerLayer(nn.Module): + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +class GPT3CoreAttention(nn.Module): + + def __init__(self, + config, + layer_number, + attn_mask_type=AttnMaskType.padding): + super().__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = config.sequence_parallel + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, self.attn_mask_type, + config.masked_softmax_fusion, attention_mask_func, + self.attention_softmax_in_fp32, coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, 'mpu') + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.sequence_parallel: + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class GPT3ParallelAttention(nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + super().__init__() + self.layer_number = max(1, layer_number) + self.params_dtype = config.params_dtype + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + # Strided linear layer. 
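+        # The fused QKV projection is column-parallel, so every
+        # tensor-parallel rank only computes its own subset of the
+        # attention heads.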
+ self.query_key_value = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + 3 * projection_size, + gather_output=False, + init_method=init_method) + + self.core_attention = GPT3CoreAttention(config, self.layer_number) + + # Output. + self.dense = mpu.RowParallelLinearV3( + config, + projection_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def _allocate_memory(self, inference_max_sequence_len, batch_size): + return torch.empty( + inference_max_sequence_len, + batch_size, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_len + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, + value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) + + # ================================== + # Adjust key and value for inference + # ================================== + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[:sequence_end, + batch_start:batch_end, ...] + value_layer = inference_value_memory[:sequence_end, + batch_start:batch_end, ...] + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, + value_layer, attention_mask) + + # ================= + # Output. 
[sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + + return output, bias + + +class nullcontext: + + def __init__(self, enter_result=None): + self.enter_result = enter_result + + def __enter__(self): + return self.enter_result + + def __exit__(self, *excinfo): + pass + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class GPT3ParallelTransformerLayer(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + + super().__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # Self attention. + self.self_attention = GPT3ParallelAttention(config, init_method, + output_layer_init_method, + layer_number) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # MLP + self.mlp = GPT3ParallelMLP(config, init_method, + output_layer_init_method) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 + and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params) + # Residual connection. 
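+        # The residual branch is either the raw layer input (the default,
+        # pre-LN style) or the layernorm output, depending on the flag below.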
+ if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func(mlp_output, + mlp_bias.expand_as(residual), + residual, self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = mpu.make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True) + + return output + + +class GPT3ParallelTransformer(nn.Module): + """Transformer class.""" + + def __init__(self, + config, + init_method, + output_layer_init_method, + post_layer_norm=True, + pre_process=True, + post_process=True): + super().__init__() + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + + self.sequence_parallel = config.sequence_parallel + + # Number of layers. + self.num_layers = config.num_hidden_layers + + # Transformer layers. + def build_layer(layer_number): + return GPT3ParallelTransformerLayer(config, init_method, + output_layer_init_method, + layer_number) + + if self.num_layers == 0: + self.num_layers = 1 + self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. 
+ # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = mpu.make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + if self.sequence_parallel: + rng_context = mpu.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + with rng_context: + # Forward pass. + for index in range(self.num_layers): + layer = self._get_layer(index) + hidden_states = layer( + hidden_states, + attention_mask, + inference_params=inference_params) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + +class GPT3TransformerLanguageModel(nn.Module): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + self.encoder_hidden_state = None + + # Embeddings. + self.embedding = GPT3Embedding(config, self.init_method) + + # Transformer. + self.encoder = GPT3ParallelTransformer( + config, + self.init_method, + output_layer_init_method, + ) + + def forward(self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + inference_params=None, + enc_hidden_states=None): + + # Encoder embedding. + encoder_input = self.embedding(enc_input_ids, enc_position_ids) + + # Run encoder. 
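+        # If pre-computed hidden states are supplied, reuse them instead of
+        # running the transformer stack.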
+ if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + inference_params=inference_params) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + return encoder_output + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT3Model(PreTrainedModel): + + config_class = GPT3Config + + def __init__(self, config, parallel_output=False): + super().__init__(config) + + self.parallel_output = parallel_output + + self.language_model = GPT3TransformerLanguageModel( + config, init_method_normal(config.init_method_std), + scaled_init_method_normal(config.init_method_std, + config.num_hidden_layers)) + + def word_embeddings_weight(self): + return self.language_model.embedding.word_embeddings.weight + + @staticmethod + def build_attention_mask_and_position_ids(tokens): + seq_length = tokens.size(1) + attention_mask = torch.tril( + torch.ones((1, 1, seq_length, seq_length), + dtype=torch.long, + device=tokens.device)) + attention_mask = (attention_mask < 0.5) + + position_ids = torch.arange( + seq_length, dtype=torch.long, device=tokens.device) + position_ids = position_ids.unsqueeze(0).expand_as(tokens) + + return attention_mask, position_ids + + def forward(self, + input_ids, + attention_mask=None, + position_ids=None, + inference_params=None, + **kwargs): + if attention_mask is None and position_ids is None: + attention_mask, position_ids = \ + self.build_attention_mask_and_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params) + + logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply( + lm_output, self.word_embeddings_weight(), None, False, True, + self.config.sequence_parallel) + # Gather if needed. + + output = logits_parallel + if not self.parallel_output: + output = mpu.gather_from_model_parallel_region(logits_parallel) + return output.transpose(0, 1).contiguous() + + +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + +def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): + """ Sample and generate a token. + Note: logits has the dimension [b, v] where b is the batch size + and v is the vocabulary size. + If vocab_size is provided, we will make sure the sample that is + generated is in [0, vocab-size). This will avoid out of vocabulary + generations due to padding. + """ + + # Check logits for consistency. + assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' + assert logits.type() == 'torch.cuda.FloatTensor', \ + 'input logits should be floats.' + + # Greedy is just simple argmax. + if top_k == 1: + assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' + samples = torch.argmax(logits, dim=-1) + + # Top-k or top-p sampling. + else: + # Clone so we do not modify the inputs, + logits = logits.clone() + # Apply temperature in place. + if temperature != 1.0: + logits.div_(temperature) + + if top_k > 1: + assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' + assert top_k <= logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(logits, top_k) + + elif top_p > 0.0: + assert top_p <= 1.0, 'top-p should be in (0, 1].' + modify_logits_for_top_p_filtering(logits, top_p) + + # After filtering, we need to recalculate the distribution. + probs = logits.softmax(dim=-1) + samples = torch.multinomial(probs, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in + # in the range [0, vocab-size). + if vocab_size: + samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) + + return samples + + +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + def __init__(self, max_batch_size, max_sequence_len): + """Note that offsets are set to zero and we always set the + flag to allocate memory. After the first call, make sure to + set this flag to False.""" + self.max_sequence_len = max_sequence_len + self.max_batch_size = max_batch_size + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.key_value_memory_dict = {} + + def swap_key_value_dict(self, batch_idx): + 'swap between batches' + if len(self.key_value_memory_dict) == 0: + raise ValueError('should not swap when dict in empty') + + for layer_number in self.key_value_memory_dict.keys(): + inference_key_memory, inference_value_memory = self.key_value_memory_dict[ + layer_number] + assert len(batch_idx) == inference_key_memory.shape[ + 1] # make sure batch size is the same + new_inference_key_memory = inference_key_memory[:, batch_idx] + new_inference_value_memory = inference_value_memory[:, batch_idx] + self.key_value_memory_dict[layer_number] = ( + new_inference_key_memory, new_inference_value_memory) + + +class DistributedGPT3(TorchModel): + + def __init__(self, + model_dir, + rank, + path_load_tag='model', + *args, + **kwargs): + super().__init__(model_dir, *args, **kwargs) + initialize_distributed(rank, mpu, kwargs['world_size'], + kwargs['model_parallel_size'], + kwargs['master_ip'], kwargs['master_port']) + seed = 0 if 'seed' not in kwargs else kwargs['seed'] + set_random_seed_mpu(seed) + set_global_variables() + + self.config = GPT3Config.from_pretrained(model_dir) + # Build model. 
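+        # parallel_output is left at its default (False), so the forward pass
+        # gathers the full-vocabulary logits on every rank before sampling.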
+ model = GPT3Model(self.config) + + for param in model.parameters(): + mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if self.config.fp16 or self.config.bf16: + model = Float16Module(model, self.config) + + self.dist_model = model + load_model = pre_load(mpu, model_dir, tag=path_load_tag) + self.dist_model.load_state_dict(load_model) + + self.inference_params = None + + def forward_step(self, tokens, attention_mask, position_ids): + logits = self.dist_model( + tokens, + attention_mask, + position_ids, + inference_params=self.inference_params) + self.inference_params.sequence_len_offset += tokens.size(1) + return logits + + def generate(self, + tokens, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False): + lengths = torch.tensor([tokens.size(1)], device=tokens.device) + pads = torch.ones( + 1, self.config.tokens_to_generate, + device=tokens.device).long() * self.config.eod_id + tokens = torch.cat((tokens, pads), dim=-1) + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + max_sequence_length = min(max_sequence_length, + self.config.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError('context length + tokens_to_generate too large') + + # Initialize inference parameters. + self.inference_params = InferenceParams(batch_size, + max_sequence_length) + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + termination_id = self.config.eod_id + + # Whether we have reached a termination id. + is_generation_done = torch.zeros( + batch_size, dtype=torch.uint8, device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = \ + GPT3Model.build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(min_prompt_length, + max_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length: + context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = self.forward_step(tokens2use, attention_mask2use, + positions2use) + + # Sample. + last_token_logits = logits[:, -1, :] + new_sample = sample( + last_token_logits, + top_k=self.config.top_k, + top_p=self.config.top_p, + temperature=temperature, + vocab_size=self.config.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Update the context length for the next token generation. 
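+                # Only the positions produced since the previous iteration are fed
+                # to the model on the next pass; forward_step advances
+                # inference_params.sequence_len_offset so the cached key/value
+                # tensors of earlier positions are reused instead of recomputed.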
+ prev_context_length = context_length + + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, context_length - 1] + == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + + if use_eod_token_for_early_termination and done: + break + + tokens = tokens[:, :(context_length + 1)] + return tokens diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index ade36e36..2c23f5db 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -19,8 +19,7 @@ from typing import Optional, Union import addict import torch -from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear, - Module, Softmax) +from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel @@ -28,7 +27,7 @@ from modelscope.utils.constant import ModelFile from .configuration_gpt3 import GPT3Config -class GPT3SelfAttention(Module): +class GPT3SelfAttention(nn.Module): """Parallel self-attention layer abstract class. Self-attention layer takes input with size [s, b, h] @@ -44,13 +43,15 @@ class GPT3SelfAttention(Module): self.hidden_size_per_attention_head = \ self.hidden_size // self.num_attention_heads - self.query_key_value = Linear(self.hidden_size, 3 * self.hidden_size) - self.softmax = Softmax(dim=-1) - self.attention_dropout = Dropout(config.attention_probs_dropout_prob) + self.query_key_value = nn.Linear(self.hidden_size, + 3 * self.hidden_size) + self.softmax = nn.Softmax(dim=-1) + self.attention_dropout = nn.Dropout( + config.attention_probs_dropout_prob) # Output. - self.dense = Linear(self.hidden_size, self.hidden_size) - self.output_dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -133,7 +134,7 @@ class GPT3SelfAttention(Module): return output -class GPT3MLP(Module): +class GPT3MLP(nn.Module): """MLP. MLP will take the input with h hidden state, project it to 4*h @@ -146,12 +147,12 @@ class GPT3MLP(Module): hidden_size = config.hidden_size # Project to 4h. - self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size) + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) self.activation_func = F.gelu # Project back to h. - self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size) + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) - self.dropout = Dropout(config.hidden_dropout_prob) + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states): @@ -164,7 +165,7 @@ class GPT3MLP(Module): return output -class GPT3TransformerLayer(Module): +class GPT3TransformerLayer(nn.Module): """A single transformer layer. 
Transformer layer takes input with size [s, b, h] and returns an @@ -175,14 +176,14 @@ class GPT3TransformerLayer(Module): super().__init__() # Layernorm on the input data. - self.input_layernorm = LayerNorm( + self.input_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # Self attention. self.attention = GPT3SelfAttention(config) # Layernorm on the attention output - self.post_attention_layernorm = LayerNorm( + self.post_attention_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # MLP @@ -208,7 +209,7 @@ class GPT3TransformerLayer(Module): return output -class GPT3Transformer(Module): +class GPT3Transformer(nn.Module): """Transformer class.""" def __init__(self, config): @@ -223,7 +224,7 @@ class GPT3Transformer(Module): [GPT3TransformerLayer(config) for _ in range(self.num_layers)]) # Final layer norm before output. - self.final_layernorm = LayerNorm( + self.final_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) def _get_layer(self, layer_number): @@ -242,7 +243,7 @@ class GPT3Transformer(Module): return hidden_states -class GPT3TransformerLanguageModel(Module): +class GPT3TransformerLanguageModel(nn.Module): """Transformer language model. Arguments: @@ -259,10 +260,11 @@ class GPT3TransformerLanguageModel(Module): super().__init__() # Embeddings. - self.word_embeddings = Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = Embedding(config.max_position_embeddings, - config.hidden_size) - self.embedding_dropout = Dropout(config.hidden_dropout_prob) + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob) # Transformer. self.transformer = GPT3Transformer(config) @@ -286,19 +288,19 @@ class GPT3Model(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, Linear): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, Embedding): + elif isinstance(module, nn.Embedding): module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -325,7 +327,7 @@ class GPT3Model(PreTrainedModel): logits = self.language_model(input_ids, attention_mask, position_ids) loss = None if labels is not None: - loss_fct = CrossEntropyLoss() + loss_fct = nn.CrossEntropyLoss() loss = loss_fct( logits.view(-1, self.config.vocab_size), labels.view(-1)) return addict.Dict(loss=loss, logits=logits) diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py new file mode 100644 index 00000000..5780ddbd --- /dev/null +++ b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tokenizers import Tokenizer + + +class JiebaBPETokenizer: + """SentencePiece BPE tokenizer with Jieba integration""" + + def __init__(self, tokenizer_json_file): + self.name = 'Jieba BPE Tokenizer' + + self.tokenizer = Tokenizer.from_file(tokenizer_json_file) + self.eod_id = self.tokenizer.token_to_id('<|endoftext|>') + try: + import jieba + except ImportError: + raise ImportError( + 'You need to install rjieba to use JiebaTokenizer. ' + 'See https://pypi.org/project/rjieba/ for installation.') + self.jieba = jieba + self.new_line = self.vocab['\n'] + self.sep_token = self.vocab[''] + + @property + def vocab_size(self): + return self.tokenizer.get_vocab_size(with_added_tokens=True) + + @property + def vocab(self): + return self.tokenizer.get_vocab(with_added_tokens=True) + + @property + def inv_vocab(self): + vocab = self.vocab + inv_vocab = dict() + for key, val in vocab.items(): + inv_vocab[val] = key + return inv_vocab + + def tokenize(self, text, is_code=False): + """ + """ + if not is_code: + seg_list = [x for x in self.jieba.cut(text)] + return self.tokenizer.encode( + seg_list, is_pretokenized=True, add_special_tokens=True).ids + else: + return self.tokenizer.encode( + text, is_pretokenized=False, add_special_tokens=True).ids + + def detokenize(self, token_ids): + text = self.tokenizer.decode(token_ids, skip_special_tokens=False) + return text + + @property + def eod(self): + return self.eod_id diff --git a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py new file mode 100644 index 00000000..325d3303 --- /dev/null +++ b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp.gpt3.distributed_gpt3 import DistributedGPT3 +from modelscope.pipelines.base import DistributedPipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TextGenerationJiebaPreprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_generation, module_name=Pipelines.gpt3_generation) +class DistributedGPT3Pipeline(DistributedPipeline): + """This class is used to instantiate the gpt3 model. 
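+
+    A typical invocation, following tests/pipelines/test_gpt3_text_generation.py
+    (the model id below is one of the ids exercised there):
+
+        >>> from modelscope.pipelines import pipeline
+        >>> from modelscope.utils.constant import Tasks
+        >>> pipe = pipeline(Tasks.text_generation,
+        ...                 model='damo/nlp_gpt3_text-generation_1.3B')
+        >>> print(pipe('好的'))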
+ """ + + model = None + + def __init__(self, model, preprocessor=None, **kwargs): + if preprocessor is None: + preprocessor = TextGenerationJiebaPreprocessor(model) + super().__init__(model, preprocessor=preprocessor, **kwargs) + assert hasattr(preprocessor, 'tokenizer') + + @classmethod + def _instantiate_one(cls, rank, model_dir, **kwargs): + cls.model = DistributedGPT3(model_dir, rank, **kwargs) + cls.model.eval() + + @classmethod + def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: + tokens = inputs['inputs']['input_ids'].cuda( + torch.cuda.current_device()) + return cls.model.generate(tokens) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + from modelscope.outputs import OutputKeys + return { + OutputKeys.TEXT: + self.preprocessor.tokenizer.detokenize(inputs[0].tolist()) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 43fa64a7..f7defd92 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, @@ -72,6 +73,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'space': [ diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index a753fe6c..f7478329 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) @@ -42,6 +43,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'text_error_correction': [ diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 3d708634..267dbb8c 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -494,6 +494,41 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py index 2b590a10..53332c0f 100755 --- a/modelscope/utils/nlp/distributed.py +++ b/modelscope/utils/nlp/distributed.py @@ -35,7 +35,10 @@ def initialize_distributed(rank, mpu, world_size, model_parallel_size, init_method = 'tcp://' init_method += master_ip + ':' + master_port torch.distributed.init_process_group( - backend='nccl', world_size=8, rank=rank, init_method=init_method) + backend='nccl', + world_size=world_size, + rank=rank, + init_method=init_method) # Set the model-parallel communicators. mpu.initialize_model_parallel(model_parallel_size) diff --git a/tests/pipelines/test_gpt3_text_generation.py b/tests/pipelines/test_gpt3_text_generation.py new file mode 100644 index 00000000..413b5874 --- /dev/null +++ b/tests/pipelines/test_gpt3_text_generation.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TextGPT3GenerationTest(unittest.TestCase): + + def setUp(self) -> None: + # please make sure this local path exists. 
+ self.model_id_1_3B = 'damo/nlp_gpt3_text-generation_1.3B' + self.model_id_2_7B = 'damo/nlp_gpt3_text-generation_2.7B' + self.model_id_13B = 'damo/nlp_gpt3_text-generation_13B' + self.model_dir_13B = snapshot_download(self.model_id_13B) + self.input = '好的' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_1_3B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B) + print(pipe(self.input)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_2_7B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B) + print(pipe(self.input)) + + @unittest.skip('distributed gpt3 13B, skipped') + def test_gpt3_13B(self): + """ The model can be downloaded from the link on + TODO: add gpt3 checkpoint link + After downloading, you should have a gpt3 model structure like this: + nlp_gpt3_text-generation_13B + |_ config.json + |_ configuration.json + |_ tokenizer.json + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + """ + pipe = pipeline(Tasks.text_generation, model=self.model_dir_13B) + print(pipe(self.input)) + + +if __name__ == '__main__': + unittest.main() From cb570d586cb5f4a467de9aad1e058e3cd3276518 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Tue, 18 Oct 2022 16:10:10 +0800 Subject: [PATCH 064/129] add referring video object segmentation pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10400324 --- ...g_video_object_segmentation_test_video.mp4 | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/__init__.py | 3 +- .../__init__.py | 23 + .../model.py | 65 ++ .../utils/__init__.py | 4 + .../utils/backbone.py | 198 +++++ .../utils/misc.py | 234 ++++++ .../utils/mttr.py | 128 +++ .../utils/multimodal_transformer.py | 440 +++++++++++ .../utils/position_encoding_2d.py | 57 ++ .../utils/postprocessing.py | 119 +++ .../utils/segmentation.py | 137 ++++ .../utils/swin_transformer.py | 731 ++++++++++++++++++ modelscope/outputs.py | 6 + modelscope/pipelines/builder.py | 3 + modelscope/pipelines/cv/__init__.py | 4 + ...ring_video_object_segmentation_pipeline.py | 193 +++++ modelscope/utils/constant.py | 3 + requirements/cv.txt | 2 + ...est_referring_video_object_segmentation.py | 56 ++ 21 files changed, 2410 insertions(+), 1 deletion(-) create mode 100644 data/test/videos/referring_video_object_segmentation_test_video.mp4 create mode 100644 modelscope/models/cv/referring_video_object_segmentation/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/model.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/misc.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py 
create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py create mode 100644 modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py create mode 100644 tests/pipelines/test_referring_video_object_segmentation.py diff --git a/data/test/videos/referring_video_object_segmentation_test_video.mp4 b/data/test/videos/referring_video_object_segmentation_test_video.mp4 new file mode 100644 index 00000000..529595a5 --- /dev/null +++ b/data/test/videos/referring_video_object_segmentation_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb +size 2924691 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2dbff948..fc18ead9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -34,6 +34,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' @@ -203,6 +204,7 @@ class Pipelines(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' image_body_reshaping = 'flow-based-body-reshaping' + referring_video_object_segmentation = 'referring-video-object-segmentation' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index fd950f4c..64039863 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -12,7 +12,8 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, image_to_image_generation, image_to_image_translation, movie_scene_segmentation, object_detection, product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, + referring_video_object_segmentation, salient_detection, + shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/referring_video_object_segmentation/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/__init__.py new file mode 100644 index 00000000..58dbf7b0 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model import MovieSceneSegmentation + +else: + _import_structure = { + 'model': ['MovieSceneSegmentation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/referring_video_object_segmentation/model.py b/modelscope/models/cv/referring_video_object_segmentation/model.py new file mode 100644 index 00000000..902a3416 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/model.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
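+# This module wraps the MTTR network as a ModelScope TorchModel: it builds the
+# model from the configuration shipped in model_dir, restores the checkpoint
+# weights, and picks a dataset-specific postprocessor (A2DSentencesPostProcess
+# or ReferYoutubeVOSPostProcess) that turns raw outputs into predicted masks.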
+import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .utils import (MTTR, A2DSentencesPostProcess, ReferYoutubeVOSPostProcess, + nested_tensor_from_videos_list) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.referring_video_object_segmentation, + module_name=Models.referring_video_object_segmentation) +class ReferringVideoObjectSegmentation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + + config_path = osp.join(model_dir, ModelFile.CONFIGURATION) + self.cfg = Config.from_file(config_path) + self.model = MTTR(**self.cfg.model) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + params_dict = torch.load(model_path, map_location='cpu') + if 'model_state_dict' in params_dict.keys(): + params_dict = params_dict['model_state_dict'] + self.model.load_state_dict(params_dict, strict=True) + + dataset_name = self.cfg.pipeline.dataset_name + if dataset_name == 'a2d_sentences' or dataset_name == 'jhmdb_sentences': + self.postprocessor = A2DSentencesPostProcess() + elif dataset_name == 'ref_youtube_vos': + self.postprocessor = ReferYoutubeVOSPostProcess() + else: + assert False, f'postprocessing for dataset: {dataset_name} is not supported' + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: + return inputs + + def inference(self, **kwargs): + window = kwargs['window'] + text_query = kwargs['text_query'] + video_metadata = kwargs['metadata'] + + window = nested_tensor_from_videos_list([window]) + valid_indices = torch.arange(len(window.tensors)) + if self._device_name == 'gpu': + valid_indices = valid_indices.cuda() + outputs = self.model(window, valid_indices, [text_query]) + window_masks = self.postprocessor( + outputs, [video_metadata], + window.tensors.shape[-2:])[0]['pred_masks'] + return window_masks + + def postprocess(self, inputs: Dict[str, Any], **kwargs): + return inputs diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py new file mode 100644 index 00000000..796bd6f4 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .misc import nested_tensor_from_videos_list +from .mttr import MTTR +from .postprocessing import A2DSentencesPostProcess, ReferYoutubeVOSPostProcess diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py new file mode 100644 index 00000000..afa384c1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py @@ -0,0 +1,198 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +import torchvision +from einops import rearrange +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from .misc import NestedTensor, is_main_process +from .swin_transformer import SwinTransformer3D + + +class VideoSwinTransformerBackbone(nn.Module): + """ + A wrapper which allows using Video-Swin Transformer as a temporal encoder for MTTR. + Check out video-swin's original paper at: https://arxiv.org/abs/2106.13230 for more info about this architecture. + Only the 'tiny' version of video swin was tested and is currently supported in our project. + Additionally, we slightly modify video-swin to make it output per-frame embeddings as required by MTTR (check our + paper's supplementary for more details), and completely discard of its 4th block. + """ + + def __init__(self, backbone_pretrained, backbone_pretrained_path, + train_backbone, running_mode, **kwargs): + super(VideoSwinTransformerBackbone, self).__init__() + # patch_size is (1, 4, 4) instead of the original (2, 4, 4). + # this prevents swinT's original temporal downsampling so we can get per-frame features. + swin_backbone = SwinTransformer3D( + patch_size=(1, 4, 4), + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_size=(8, 7, 7), + drop_path_rate=0.1, + patch_norm=True) + if backbone_pretrained and running_mode == 'train': + state_dict = torch.load(backbone_pretrained_path)['state_dict'] + # extract swinT's kinetics-400 pretrained weights and ignore the rest (prediction head etc.) + state_dict = { + k[9:]: v + for k, v in state_dict.items() if 'backbone.' 
in k + } + + # sum over the patch embedding weight temporal dim [96, 3, 2, 4, 4] --> [96, 3, 1, 4, 4] + patch_embed_weight = state_dict['patch_embed.proj.weight'] + patch_embed_weight = patch_embed_weight.sum(dim=2, keepdims=True) + state_dict['patch_embed.proj.weight'] = patch_embed_weight + swin_backbone.load_state_dict(state_dict) + + self.patch_embed = swin_backbone.patch_embed + self.pos_drop = swin_backbone.pos_drop + self.layers = swin_backbone.layers[:-1] + self.downsamples = nn.ModuleList() + for layer in self.layers: + self.downsamples.append(layer.downsample) + layer.downsample = None + self.downsamples[ + -1] = None # downsampling after the last layer is not necessary + + self.layer_output_channels = [ + swin_backbone.embed_dim * 2**i for i in range(len(self.layers)) + ] + self.train_backbone = train_backbone + if not train_backbone: + for parameter in self.parameters(): + parameter.requires_grad_(False) + + def forward(self, samples: NestedTensor): + vid_frames = rearrange(samples.tensors, 't b c h w -> b c t h w') + + vid_embeds = self.patch_embed(vid_frames) + vid_embeds = self.pos_drop(vid_embeds) + layer_outputs = [] # layer outputs before downsampling + for layer, downsample in zip(self.layers, self.downsamples): + vid_embeds = layer(vid_embeds.contiguous()) + layer_outputs.append(vid_embeds) + if downsample: + vid_embeds = rearrange(vid_embeds, 'b c t h w -> b t h w c') + vid_embeds = downsample(vid_embeds) + vid_embeds = rearrange(vid_embeds, 'b t h w c -> b c t h w') + layer_outputs = [ + rearrange(o, 'b c t h w -> t b c h w') for o in layer_outputs + ] + + outputs = [] + orig_pad_mask = samples.mask + for l_out in layer_outputs: + pad_mask = F.interpolate( + orig_pad_mask.float(), size=l_out.shape[-2:]).to(torch.bool) + outputs.append(NestedTensor(l_out, pad_mask)) + return outputs + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + BatchNorm2d where the batch statistics and the affine parameters are fixed. + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer('weight', torch.ones(n)) + self.register_buffer('bias', torch.zeros(n)) + self.register_buffer('running_mean', torch.zeros(n)) + self.register_buffer('running_var', torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, + self)._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class ResNetBackbone(nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + ResNet backbone with frozen BatchNorm. 
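+    Video clips of shape [T, B, C, H, W] are flattened into a (T*B) image batch,
+    passed through the torchvision ResNet, and the four intermediate layer
+    outputs are returned as per-frame NestedTensors with padding masks resized
+    to each feature scale.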
+ """ + + def __init__(self, + backbone_name: str = 'resnet50', + train_backbone: bool = True, + dilation: bool = True, + **kwargs): + super(ResNetBackbone, self).__init__() + backbone = getattr(torchvision.models, backbone_name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=FrozenBatchNorm2d) + for name, parameter in backbone.named_parameters(): + if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: + parameter.requires_grad_(False) + return_layers = { + 'layer1': '0', + 'layer2': '1', + 'layer3': '2', + 'layer4': '3' + } + self.body = IntermediateLayerGetter( + backbone, return_layers=return_layers) + output_channels = 512 if backbone_name in ('resnet18', + 'resnet34') else 2048 + self.layer_output_channels = [ + output_channels // 8, output_channels // 4, output_channels // 2, + output_channels + ] + + def forward(self, tensor_list: NestedTensor): + t, b, _, _, _ = tensor_list.tensors.shape + video_frames = rearrange(tensor_list.tensors, + 't b c h w -> (t b) c h w') + padding_masks = rearrange(tensor_list.mask, 't b h w -> (t b) h w') + features_list = self.body(video_frames) + out = [] + for _, f in features_list.items(): + resized_padding_masks = F.interpolate( + padding_masks[None].float(), + size=f.shape[-2:]).to(torch.bool)[0] + f = rearrange(f, '(t b) c h w -> t b c h w', t=t, b=b) + resized_padding_masks = rearrange( + resized_padding_masks, '(t b) h w -> t b h w', t=t, b=b) + out.append(NestedTensor(f, resized_padding_masks)) + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def init_backbone(backbone_name, **kwargs): + if backbone_name == 'swin-t': + return VideoSwinTransformerBackbone(**kwargs) + elif 'resnet' in backbone_name: + return ResNetBackbone(backbone_name, **kwargs) + assert False, f'error: backbone "{backbone_name}" is not supported' diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py new file mode 100644 index 00000000..ecf34b8c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py @@ -0,0 +1,234 @@ +# Modified from DETR https://github.com/facebookresearch/detr +# Misc functions. +# Mostly copy-paste from torchvision references. 
+ +import pickle +from typing import List, Optional + +import torch +import torch.distributed as dist +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision +from torch import Tensor + +if float(torchvision.__version__.split('.')[1]) < 7.0: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to('cuda') + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device='cuda') + size_list = [torch.tensor([0], device='cuda') for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append( + torch.empty((max_size, ), dtype=torch.uint8, device='cuda')) + if local_size != max_size: + padding = torch.empty( + size=(max_size - local_size, ), dtype=torch.uint8, device='cuda') + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + """ + This function receives a list of image tensors and returns a NestedTensor of the padded images, along with their + padding masks (true for padding areas, false otherwise). 
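+    For example, tensors of shapes [3, 200, 250] and [3, 180, 300] are copied
+    into a zero-padded batch of shape [2, 3, 200, 300], and the returned mask of
+    shape [2, 200, 300] is True only over the padded area of each image.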
+ """ + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img) + m[:img.shape[1], :img.shape[2]] = False + return NestedTensor(tensor, mask) + + +def nested_tensor_from_videos_list(videos_list: List[Tensor]): + """ + This function receives a list of videos (each of shape [T, C, H, W]) and returns a NestedTensor of the padded + videos (shape [T, B, C, PH, PW], along with their padding masks (true for padding areas, false otherwise, of shape + [T, B, PH, PW]. + """ + max_size = _max_by_axis([list(img.shape) for img in videos_list]) + padded_batch_shape = [len(videos_list)] + max_size + b, t, c, h, w = padded_batch_shape + dtype = videos_list[0].dtype + device = videos_list[0].device + padded_videos = torch.zeros(padded_batch_shape, dtype=dtype, device=device) + videos_pad_masks = torch.ones((b, t, h, w), + dtype=torch.bool, + device=device) + for vid_frames, pad_vid_frames, vid_pad_m in zip(videos_list, + padded_videos, + videos_pad_masks): + pad_vid_frames[:vid_frames.shape[0], :, :vid_frames. + shape[2], :vid_frames.shape[3]].copy_(vid_frames) + vid_pad_m[:vid_frames.shape[0], :vid_frames.shape[2], :vid_frames. + shape[3]] = False + # transpose the temporal and batch dims and create a NestedTensor: + return NestedTensor( + padded_videos.transpose(0, 1), videos_pad_masks.transpose(0, 1)) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def interpolate(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. 
+ """ + if float(torchvision.__version__.split('.')[1]) < 7.0: + if input.numel() > 0: + return torch.nn.functional.interpolate(input, size, scale_factor, + mode, align_corners) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, + mode, align_corners) diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py new file mode 100644 index 00000000..e603df6c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py @@ -0,0 +1,128 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn + +from .backbone import init_backbone +from .misc import NestedTensor +from .multimodal_transformer import MultimodalTransformer +from .segmentation import FPNSpatialDecoder + + +class MTTR(nn.Module): + """ The main module of the Multimodal Tracking Transformer """ + + def __init__(self, + num_queries, + mask_kernels_dim=8, + aux_loss=False, + **kwargs): + """ + Parameters: + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + MTTR can detect in a single image. In our paper we use 50 in all settings. + mask_kernels_dim: dim of the segmentation kernels and of the feature maps outputted by the spatial decoder. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.backbone = init_backbone(**kwargs) + self.transformer = MultimodalTransformer(**kwargs) + d_model = self.transformer.d_model + self.is_referred_head = nn.Linear( + d_model, + 2) # binary 'is referred?' prediction head for object queries + self.instance_kernels_head = MLP( + d_model, d_model, output_dim=mask_kernels_dim, num_layers=2) + self.obj_queries = nn.Embedding( + num_queries, d_model) # pos embeddings for the object queries + self.vid_embed_proj = nn.Conv2d( + self.backbone.layer_output_channels[-1], d_model, kernel_size=1) + self.spatial_decoder = FPNSpatialDecoder( + d_model, self.backbone.layer_output_channels[:-1][::-1], + mask_kernels_dim) + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor, valid_indices, text_queries): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: Batched frames of shape [time x batch_size x 3 x H x W] + - samples.mask: A binary mask of shape [time x batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_is_referred": The reference prediction logits for all queries. + Shape: [time x batch_size x num_queries x 2] + - "pred_masks": The mask logits for all queries. + Shape: [time x batch_size x num_queries x H_mask x W_mask] + - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of + dictionaries containing the two above keys for each decoder layer. + """ + backbone_out = self.backbone(samples) + # keep only the valid frames (frames which are annotated): + # (for example, in a2d-sentences only the center frame in each window is annotated). 
+ for layer_out in backbone_out: + layer_out.tensors = layer_out.tensors.index_select( + 0, valid_indices) + layer_out.mask = layer_out.mask.index_select(0, valid_indices) + bbone_final_layer_output = backbone_out[-1] + vid_embeds, vid_pad_mask = bbone_final_layer_output.decompose() + + T, B, _, _, _ = vid_embeds.shape + vid_embeds = rearrange(vid_embeds, 't b c h w -> (t b) c h w') + vid_embeds = self.vid_embed_proj(vid_embeds) + vid_embeds = rearrange( + vid_embeds, '(t b) c h w -> t b c h w', t=T, b=B) + + transformer_out = self.transformer(vid_embeds, vid_pad_mask, + text_queries, + self.obj_queries.weight) + # hs is: [L, T, B, N, D] where L is number of decoder layers + # vid_memory is: [T, B, D, H, W] + # txt_memory is a list of length T*B of [S, C] where S might be different for each sentence + # encoder_middle_layer_outputs is a list of [T, B, H, W, D] + hs, vid_memory, txt_memory = transformer_out + + vid_memory = rearrange(vid_memory, 't b d h w -> (t b) d h w') + bbone_middle_layer_outputs = [ + rearrange(o.tensors, 't b d h w -> (t b) d h w') + for o in backbone_out[:-1][::-1] + ] + decoded_frame_features = self.spatial_decoder( + vid_memory, bbone_middle_layer_outputs) + decoded_frame_features = rearrange( + decoded_frame_features, '(t b) d h w -> t b d h w', t=T, b=B) + instance_kernels = self.instance_kernels_head(hs) # [L, T, B, N, C] + # output masks is: [L, T, B, N, H_mask, W_mask] + output_masks = torch.einsum('ltbnc,tbchw->ltbnhw', instance_kernels, + decoded_frame_features) + outputs_is_referred = self.is_referred_head(hs) # [L, T, B, N, 2] + + layer_outputs = [] + for pm, pir in zip(output_masks, outputs_is_referred): + layer_out = {'pred_masks': pm, 'pred_is_referred': pir} + layer_outputs.append(layer_out) + out = layer_outputs[ + -1] # the output for the last decoder layer is used by default + if self.aux_loss: + out['aux_outputs'] = layer_outputs[:-1] + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py new file mode 100644 index 00000000..8c24e397 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py @@ -0,0 +1,440 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# MTTR Multimodal Transformer class. 
+# Modified from DETR https://github.com/facebookresearch/detr + +import copy +import os +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor, nn +from transformers import RobertaModel, RobertaTokenizerFast + +from .position_encoding_2d import PositionEmbeddingSine2D + +os.environ[ + 'TOKENIZERS_PARALLELISM'] = 'false' # this disables a huggingface tokenizer warning (printed every epoch) + + +class MultimodalTransformer(nn.Module): + + def __init__(self, + num_encoder_layers=3, + num_decoder_layers=3, + text_encoder_type='roberta-base', + freeze_text_encoder=True, + **kwargs): + super().__init__() + self.d_model = kwargs['d_model'] + encoder_layer = TransformerEncoderLayer(**kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers) + decoder_layer = TransformerDecoderLayer(**kwargs) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + norm=nn.LayerNorm(self.d_model), + return_intermediate=True) + self.pos_encoder_2d = PositionEmbeddingSine2D() + self._reset_parameters() + + self.text_encoder = RobertaModel.from_pretrained(text_encoder_type) + self.text_encoder.pooler = None # this pooler is never used, this is a hack to avoid DDP problems... + self.tokenizer = RobertaTokenizerFast.from_pretrained( + text_encoder_type) + self.freeze_text_encoder = freeze_text_encoder + if freeze_text_encoder: + for p in self.text_encoder.parameters(): + p.requires_grad_(False) + + self.txt_proj = FeatureResizer( + input_feat_size=self.text_encoder.config.hidden_size, + output_feat_size=self.d_model, + dropout=kwargs['dropout'], + ) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, vid_embeds, vid_pad_mask, text_queries, obj_queries): + device = vid_embeds.device + t, b, _, h, w = vid_embeds.shape + + txt_memory, txt_pad_mask = self.forward_text(text_queries, device) + # add temporal dim to txt memory & padding mask: + txt_memory = repeat(txt_memory, 's b c -> s (t b) c', t=t) + txt_pad_mask = repeat(txt_pad_mask, 'b s -> (t b) s', t=t) + + vid_embeds = rearrange(vid_embeds, 't b c h w -> (h w) (t b) c') + # Concat the image & text embeddings on the sequence dimension + encoder_src_seq = torch.cat((vid_embeds, txt_memory), dim=0) + seq_mask = torch.cat( + (rearrange(vid_pad_mask, 't b h w -> (t b) (h w)'), txt_pad_mask), + dim=1) + # vid_pos_embed is: [T*B, H, W, d_model] + vid_pos_embed = self.pos_encoder_2d( + rearrange(vid_pad_mask, 't b h w -> (t b) h w'), self.d_model) + # use zeros in place of pos embeds for the text sequence: + pos_embed = torch.cat( + (rearrange(vid_pos_embed, 't_b h w c -> (h w) t_b c'), + torch.zeros_like(txt_memory)), + dim=0) + + memory = self.encoder( + encoder_src_seq, src_key_padding_mask=seq_mask, + pos=pos_embed) # [S, T*B, C] + vid_memory = rearrange( + memory[:h * w, :, :], + '(h w) (t b) c -> t b c h w', + h=h, + w=w, + t=t, + b=b) + txt_memory = memory[h * w:, :, :] + txt_memory = rearrange(txt_memory, 's t_b c -> t_b s c') + txt_memory = [ + t_mem[~pad_mask] + for t_mem, pad_mask in zip(txt_memory, txt_pad_mask) + ] # remove padding + + # add T*B dims to query embeds (was: [N, C], where N is the number of object queries): + obj_queries = repeat(obj_queries, 'n c -> n (t b) c', t=t, b=b) + tgt = torch.zeros_like(obj_queries) # [N, T*B, C] + + # hs is [L, N, T*B, C] where L is number of layers in the decoder + hs = self.decoder( + tgt, + memory, + 
memory_key_padding_mask=seq_mask, + pos=pos_embed, + query_pos=obj_queries) + hs = rearrange(hs, 'l n (t b) c -> l t b n c', t=t, b=b) + return hs, vid_memory, txt_memory + + def forward_text(self, text_queries, device): + tokenized_queries = self.tokenizer.batch_encode_plus( + text_queries, padding='longest', return_tensors='pt') + tokenized_queries = tokenized_queries.to(device) + with torch.inference_mode(mode=self.freeze_text_encoder): + encoded_text = self.text_encoder(**tokenized_queries) + # Transpose memory because pytorch's attention expects sequence first + txt_memory = rearrange(encoded_text.last_hidden_state, + 'b s c -> s b c') + txt_memory = self.txt_proj( + txt_memory) # change text embeddings dim to model dim + # Invert attention mask that we get from huggingface because its the opposite in pytorch transformer + txt_pad_mask = tokenized_queries.attention_mask.ne(1).bool() # [B, S] + return txt_memory, txt_pad_mask + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = 
normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, + k, + value=src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, + k, + value=src2, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + 
memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, + pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). + """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == 'relu': + return F.relu + if activation == 'gelu': + return F.gelu + if activation == 'glu': + return F.glu + raise RuntimeError(F'activation should be relu/gelu, not {activation}.') diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py new file mode 100644 index 00000000..f9ef05a1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py @@ -0,0 +1,57 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr +# 2D sine positional encodings for the visual features in the multimodal transformer. + +import math + +import torch +from torch import Tensor, nn + + +class PositionEmbeddingSine2D(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
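    Example (illustrative values, assuming an even hidden_dim; not part of the
    original docstring):

        mask = torch.zeros(2, 36, 63, dtype=torch.bool)       # no padded pixels
        pos = PositionEmbeddingSine2D()(mask, hidden_dim=256)
        # pos.shape == (2, 36, 63, 256): hidden_dim // 2 sine/cosine features per
        # spatial axis, concatenated as (pos_y, pos_x) along the last dimension.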
+ """ + + def __init__(self, temperature=10000, normalize=True, scale=None): + super().__init__() + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError('normalize should be True if scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask: Tensor, hidden_dim: int): + """ + @param mask: a tensor of shape [B, H, W] + @param hidden_dim: int + @return: + """ + num_pos_feats = hidden_dim // 2 + + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3) + return pos diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py new file mode 100644 index 00000000..64582140 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py @@ -0,0 +1,119 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import numpy as np +import pycocotools.mask as mask_util +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +class A2DSentencesPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(A2DSentencesPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, resized_padded_sample_size, + resized_sample_sizes, orig_sample_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + resized_padded_sample_size: size of samples (input to model) after size augmentation + padding. + resized_sample_sizes: size of samples after size augmentation but without padding. 
+ orig_sample_sizes: original size of the samples (no augmentations or padding) + """ + pred_is_referred = outputs['pred_is_referred'] + prob = F.softmax(pred_is_referred, dim=-1) + scores = prob[..., 0] + pred_masks = outputs['pred_masks'] + pred_masks = F.interpolate( + pred_masks, + size=resized_padded_sample_size, + mode='bilinear', + align_corners=False) + pred_masks = (pred_masks.sigmoid() > 0.5) + processed_pred_masks, rle_masks = [], [] + for f_pred_masks, resized_size, orig_size in zip( + pred_masks, resized_sample_sizes, orig_sample_sizes): + f_mask_h, f_mask_w = resized_size # resized shape without padding + # remove the samples' padding + f_pred_masks_no_pad = f_pred_masks[:, :f_mask_h, : + f_mask_w].unsqueeze(1) + # resize the samples back to their original dataset (target) size for evaluation + f_pred_masks_processed = F.interpolate( + f_pred_masks_no_pad.float(), size=orig_size, mode='nearest') + f_pred_rle_masks = [ + mask_util.encode( + np.array( + mask[0, :, :, np.newaxis], dtype=np.uint8, + order='F'))[0] + for mask in f_pred_masks_processed.cpu() + ] + processed_pred_masks.append(f_pred_masks_processed) + rle_masks.append(f_pred_rle_masks) + predictions = [{ + 'scores': s, + 'masks': m, + 'rle_masks': rle + } for s, m, rle in zip(scores, processed_pred_masks, rle_masks)] + return predictions + + +class ReferYoutubeVOSPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(ReferYoutubeVOSPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, videos_metadata, samples_shape_with_padding): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + videos_metadata: a dictionary with each video's metadata. + samples_shape_with_padding: size of the batch frames with padding. 
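        Returns:
            a list with one dict per video, each merging that video's metadata with a
            'pred_masks' entry: a uint8 tensor of binary masks for the highest-scoring
            query trajectory, resized back to the video's original frame size.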
+ """ + pred_is_referred = outputs['pred_is_referred'] + prob_is_referred = F.softmax(pred_is_referred, dim=-1) + # note we average on the temporal dim to compute score per trajectory: + trajectory_scores = prob_is_referred[..., 0].mean(dim=0) + pred_trajectory_indices = torch.argmax(trajectory_scores, dim=-1) + pred_masks = rearrange(outputs['pred_masks'], + 't b nq h w -> b t nq h w') + # keep only the masks of the chosen trajectories: + b = pred_masks.shape[0] + pred_masks = pred_masks[torch.arange(b), :, pred_trajectory_indices] + # resize the predicted masks to the size of the model input (which might include padding) + pred_masks = F.interpolate( + pred_masks, + size=samples_shape_with_padding, + mode='bilinear', + align_corners=False) + # apply a threshold to create binary masks: + pred_masks = (pred_masks.sigmoid() > 0.5) + # remove the padding per video (as videos might have different resolutions and thus different padding): + preds_by_video = [] + for video_pred_masks, video_metadata in zip(pred_masks, + videos_metadata): + # size of the model input batch frames without padding: + resized_h, resized_w = video_metadata['resized_frame_size'] + video_pred_masks = video_pred_masks[:, :resized_h, : + resized_w].unsqueeze( + 1) # remove the padding + # resize the masks back to their original frames dataset size for evaluation: + original_frames_size = video_metadata['original_frame_size'] + tuple_size = tuple(original_frames_size.cpu().numpy()) + video_pred_masks = F.interpolate( + video_pred_masks.float(), size=tuple_size, mode='nearest') + video_pred_masks = video_pred_masks.to(torch.uint8).cpu() + # combine the predicted masks and the video metadata to create a final predictions dict: + video_pred = {**video_metadata, **{'pred_masks': video_pred_masks}} + preds_by_video.append(video_pred) + return preds_by_video diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py new file mode 100644 index 00000000..b3228820 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py @@ -0,0 +1,137 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class FPNSpatialDecoder(nn.Module): + """ + An FPN-like spatial decoder. Generates high-res, semantically rich features which serve as the base for creating + instance segmentation masks. 
+ """ + + def __init__(self, context_dim, fpn_dims, mask_kernels_dim=8): + super().__init__() + + inter_dims = [ + context_dim, context_dim // 2, context_dim // 4, context_dim // 8, + context_dim // 16 + ] + self.lay1 = torch.nn.Conv2d(context_dim, inter_dims[0], 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, inter_dims[0]) + self.lay2 = torch.nn.Conv2d(inter_dims[0], inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.context_dim = context_dim + + self.add_extra_layer = len(fpn_dims) == 3 + if self.add_extra_layer: + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + self.lay5 = torch.nn.Conv2d( + inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d( + inter_dims[4], mask_kernels_dim, 3, padding=1) + else: + self.out_lay = torch.nn.Conv2d( + inter_dims[3], mask_kernels_dim, 3, padding=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, layer_features: List[Tensor]): + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(layer_features[0]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(layer_features[1]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + if self.add_extra_layer: + cur_fpn = self.adapter3(layer_features[2]) + x = cur_fpn + F.interpolate( + x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def dice_loss(inputs, targets, num_masks): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +def sigmoid_focal_loss(inputs, + targets, + num_masks, + alpha: float = 0.25, + gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). 
+ gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits( + inputs, targets, reduction='none') + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t)**gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_masks diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py new file mode 100644 index 00000000..9a08ef48 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py @@ -0,0 +1,731 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from Video-Swin-Transformer https://github.com/SwinTransformer/Video-Swin-Transformer + +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from timm.models.layers import DropPath, trunc_normal_ + + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, D, H, W, C) + window_size (tuple[int]): window size + + Returns: + windows: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C) + windows = x.permute(0, 1, 3, 5, 2, 4, 6, + 7).contiguous().view(-1, reduce(mul, window_size), C) + return windows + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.view(B, D // window_size[0], H // window_size[1], + W // window_size[2], window_size[0], window_size[1], + window_size[2], -1) + x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class WindowAttention3D(nn.Module): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. 
+ window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + wd, wh, ww = window_size + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * wd - 1) * (2 * wh - 1) * (2 * ww - 1), num_heads)) + + # get pair-wise relative position index for each token inside the window + coords_d = torch.arange(self.window_size[0]) + coords_h = torch.arange(self.window_size[1]) + coords_w = torch.arange(self.window_size[2]) + coords = torch.stack(torch.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] + - 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape(-1)].reshape( + N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Module): + """ Swin Transformer Block. 
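    Each block applies (shifted-)window multi-head self-attention followed by an MLP,
    with layer normalization and a residual connection around each of the two parts
    (see forward_part1 / forward_part2 below).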
+ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.use_checkpoint = use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[1] < self.window_size[ + 1], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[2] < self.window_size[ + 2], 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D( + dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = torch.roll( + x, + shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), + dims=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.view(-1, *(window_size + (C, ))) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = torch.roll( + shifted_x, + shifts=(shift_size[0], shift_size[1], shift_size[2]), + dims=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :].contiguous() + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + if self.use_checkpoint: + x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) + else: + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + + if self.use_checkpoint: + x = x + checkpoint.checkpoint(self.forward_part2, x) + else: + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(nn.Module): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
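        Returns:
            Tensor of size (B, D, ceil(H/2), ceil(W/2), 2*C): the four spatial
            neighbours of each location are concatenated to 4*C channels,
            normalized, then linearly reduced to 2*C channels.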
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size, device): + img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + return attn_mask + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
+ """ + # calculate attention mask for SW-MSA + B, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + x = rearrange(x, 'b c d h w -> b d h w c') + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.view(B, D, H, W, -1) + + if self.downsample is not None: + x = self.downsample(x) + x = rearrange(x, 'b d h w c -> b c d h w') + return x + + +class PatchEmbed3D(nn.Module): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad( + x, + (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + + return x + + +class SwinTransformer3D(nn.Module): + """ Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
+ """ + + def __init__(self, + pretrained=None, + pretrained2d=True, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def inflate_weights(self, logger): + """Inflate the swin2d parameters to swin3d. + + The differences between swin3d and swin2d mainly lie in an extra + axis. To utilize the pretrained parameters in 2d model, + the weight of swin2d models should be inflated to fit in the shapes of + the 3d counterpart. + + Args: + logger (logging.Logger): The logger used to print + debugging infomation. 
+ """ + checkpoint = torch.load(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [ + k for k in state_dict.keys() if 'relative_position_index' in k + ] + for k in relative_position_index_keys: + del state_dict[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k] + for k in attn_mask_keys: + del state_dict[k] + + state_dict['patch_embed.proj.weight'] = state_dict[ + 'patch_embed.proj.weight'].unsqueeze(2).repeat( + 1, 1, self.patch_size[0], 1, 1) / self.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for k in relative_position_bias_table_keys: + relative_position_bias_table_pretrained = state_dict[k] + relative_position_bias_table_current = self.state_dict()[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) + wd = self.window_size[0] + if nH1 != nH2: + logger.warning(f'Error in loading {k}, passing') + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute( + 1, 0).view(1, nH1, S1, S1), + size=(2 * self.window_size[1] - 1, + 2 * self.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view( + nH2, L2).permute(1, 0) + state_dict[k] = relative_position_bias_table_pretrained.repeat( + 2 * wd - 1, 1) + + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + logger.info(f"=> loaded successfully '{self.pretrained}'") + del checkpoint + torch.cuda.empty_cache() + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x.contiguous()) + + x = rearrange(x, 'n c d h w -> n d h w c') + x = self.norm(x) + x = rearrange(x, 'n d h w c -> n c d h w') + + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/modelscope/outputs.py b/modelscope/outputs.py index a49ddacf..fbe15646 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -417,6 +417,12 @@ TASK_OUTPUTS = { # } Tasks.video_summarization: [OutputKeys.OUTPUT], + # referring video object segmentation result for a single video + # { + # "masks": [np.array # 2D array with shape [height, width]] + # } + Tasks.referring_video_object_segmentation: [OutputKeys.MASKS], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 174d10b1..8098bdec 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -202,6 +202,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'), Tasks.product_segmentation: (Pipelines.product_segmentation, 'damo/cv_F3Net_product-segmentation'), + Tasks.referring_video_object_segmentation: + (Pipelines.referring_video_object_segmentation, + 'damo/cv_swin-t_referring_video-object-segmentation'), } diff --git 
a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index f84f5fe5..97cd8761 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -58,6 +58,7 @@ if TYPE_CHECKING: from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin from .hand_static_pipeline import HandStaticPipeline + from .referring_video_object_segmentation_pipeline import ReferringVideoObjectSegmentationPipeline else: _import_structure = { @@ -128,6 +129,9 @@ else: ['FacialExpressionRecognitionPipeline'], 'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'], 'hand_static_pipeline': ['HandStaticPipeline'], + 'referring_video_object_segmentation_pipeline': [ + 'ReferringVideoObjectSegmentationPipeline' + ], } import sys diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py new file mode 100644 index 00000000..d264b386 --- /dev/null +++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py @@ -0,0 +1,193 @@ +# The implementation here is modified based on MTTR, +# originally Apache 2.0 License and publicly avaialbe at https://github.com/mttr2021/MTTR +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import numpy as np +import torch +import torchvision +import torchvision.transforms.functional as F +from einops import rearrange +from moviepy.editor import AudioFileClip, ImageSequenceClip, VideoFileClip +from PIL import Image, ImageDraw, ImageFont, ImageOps +from tqdm import tqdm + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.referring_video_object_segmentation, + module_name=Pipelines.referring_video_object_segmentation) +class ReferringVideoObjectSegmentationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """use `model` to create a referring video object segmentation pipeline for prediction + + Args: + model: model id on modelscope hub + """ + _device = kwargs.pop('device', 'gpu') + if torch.cuda.is_available() and _device == 'gpu': + self.device = 'gpu' + else: + self.device = 'cpu' + super().__init__(model=model, device=self.device, **kwargs) + + logger.info('Load model done!') + + def preprocess(self, input: Input) -> Dict[str, Any]: + """ + + Args: + input: path of the input video + + """ + assert isinstance(input, tuple) and len( + input + ) == 4, 'error - input type must be tuple and input length must be 4' + self.input_video_pth, text_queries, start_pt, end_pt = input + + assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long' + assert 1 <= len( + text_queries) <= 2, 'error - 1-2 input text queries are expected' + + # extract the relevant subclip: + self.input_clip_pth = 'input_clip.mp4' + with VideoFileClip(self.input_video_pth) as video: + subclip = video.subclip(start_pt, end_pt) + subclip.write_videofile(self.input_clip_pth) + + self.window_length = 24 # length of window during inference + self.window_overlap = 6 # overlap (in frames) between consecutive windows + + self.video, audio, self.meta = torchvision.io.read_video( + 
filename=self.input_clip_pth) + self.video = rearrange(self.video, 't h w c -> t c h w') + + input_video = F.resize(self.video, size=360, max_size=640) + if self.device_name == 'gpu': + input_video = input_video.cuda() + + input_video = input_video.to(torch.float).div_(255) + input_video = F.normalize( + input_video, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + video_metadata = { + 'resized_frame_size': input_video.shape[-2:], + 'original_frame_size': self.video.shape[-2:] + } + + # partition the clip into overlapping windows of frames: + windows = [ + input_video[i:i + self.window_length] + for i in range(0, len(input_video), self.window_length + - self.window_overlap) + ] + # clean up the text queries: + self.text_queries = [' '.join(q.lower().split()) for q in text_queries] + + result = { + 'text_queries': self.text_queries, + 'windows': windows, + 'video_metadata': video_metadata + } + + return result + + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + pred_masks_per_query = [] + t, _, h, w = self.video.shape + for text_query in tqdm(input['text_queries'], desc='text queries'): + pred_masks = torch.zeros(size=(t, 1, h, w)) + for i, window in enumerate( + tqdm(input['windows'], desc='windows')): + + window_masks = self.model.inference( + window=window, + text_query=text_query, + metadata=input['video_metadata']) + + win_start_idx = i * ( + self.window_length - self.window_overlap) + pred_masks[win_start_idx:win_start_idx + + self.window_length] = window_masks + pred_masks_per_query.append(pred_masks) + return pred_masks_per_query + + def postprocess(self, inputs) -> Dict[str, Any]: + if self.model.cfg.pipeline.save_masked_video: + # RGB colors for instance masks: + light_blue = (41, 171, 226) + purple = (237, 30, 121) + dark_green = (35, 161, 90) + orange = (255, 148, 59) + colors = np.array([light_blue, purple, dark_green, orange]) + + # width (in pixels) of the black strip above the video on which the text queries will be displayed: + text_border_height_per_query = 36 + + video_np = rearrange(self.video, + 't c h w -> t h w c').numpy() / 255.0 + + # del video + pred_masks_per_frame = rearrange( + torch.stack(inputs), 'q t 1 h w -> t q h w').numpy() + masked_video = [] + for vid_frame, frame_masks in tqdm( + zip(video_np, pred_masks_per_frame), + total=len(video_np), + desc='applying masks...'): + # apply the masks: + for inst_mask, color in zip(frame_masks, colors): + vid_frame = apply_mask(vid_frame, inst_mask, color / 255.0) + vid_frame = Image.fromarray((vid_frame * 255).astype(np.uint8)) + # visualize the text queries: + vid_frame = ImageOps.expand( + vid_frame, + border=(0, len(self.text_queries) + * text_border_height_per_query, 0, 0)) + W, H = vid_frame.size + draw = ImageDraw.Draw(vid_frame) + font = ImageFont.truetype(font='DejaVuSansMono.ttf', size=30) + for i, (text_query, color) in enumerate( + zip(self.text_queries, colors), start=1): + w, h = draw.textsize(text_query, font=font) + draw.text(((W - w) / 2, + (text_border_height_per_query * i) - h - 3), + text_query, + fill=tuple(color) + (255, ), + font=font) + masked_video.append(np.array(vid_frame)) + print(type(vid_frame)) + print(type(masked_video[0])) + print(masked_video[0].shape) + # generate and save the output clip: + + assert self.model.cfg.pipeline.output_path + output_clip_path = self.model.cfg.pipeline.output_path + clip = ImageSequenceClip( + sequence=masked_video, fps=self.meta['video_fps']) + clip = 
clip.set_audio(AudioFileClip(self.input_clip_pth)) + clip.write_videofile( + output_clip_path, fps=self.meta['video_fps'], audio=True) + del masked_video + + result = {OutputKeys.MASKS: inputs} + return result + + +def apply_mask(image, mask, color, transparency=0.7): + mask = mask[..., np.newaxis].repeat(repeats=3, axis=2) + mask = mask * transparency + color_matrix = np.ones(image.shape, dtype=np.float) * color + out_image = color_matrix * mask + image * (1.0 - mask) + return out_image diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 0eb369da..6ba58c19 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -80,6 +80,9 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' movie_scene_segmentation = 'movie-scene-segmentation' + # video segmentation + referring_video_object_segmentation = 'referring-video-object-segmentation' + # video editing video_inpainting = 'video-inpainting' diff --git a/requirements/cv.txt b/requirements/cv.txt index eb38beb1..d23fab3a 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1,4 +1,5 @@ albumentations>=1.0.3 +av>=9.2.0 easydict fairscale>=0.4.1 fastai>=1.0.51 @@ -14,6 +15,7 @@ lpips ml_collections mmcls>=0.21.0 mmdet>=2.25.0 +moviepy>=1.0.3 networkx>=2.5 numba onnxruntime>=1.10 diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py new file mode 100644 index 00000000..3e81d9c3 --- /dev/null +++ b/tests/pipelines/test_referring_video_object_segmentation.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ReferringVideoObjectSegmentationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.referring_video_object_segmentation + self.model_id = 'damo/cv_swin-t_referring_video-object-segmentation' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_referring_video_object_segmentation(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline( + Tasks.referring_video_object_segmentation, model=self.model_id) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_referring_video_object_segmentation_with_default_task(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline(Tasks.referring_video_object_segmentation) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 6438c41144eb779be9b649ccac860991658e8dc1 
Mon Sep 17 00:00:00 2001 From: "yongfei.zyf" Date: Tue, 18 Oct 2022 16:50:47 +0800 Subject: [PATCH 065/129] [to #42322933]Support video url input processing Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10437340 --- modelscope/preprocessors/video.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py index f693cd9e..794033b5 100644 --- a/modelscope/preprocessors/video.py +++ b/modelscope/preprocessors/video.py @@ -1,5 +1,10 @@ import math +import os import random +import uuid +from os.path import exists +from tempfile import TemporaryDirectory +from urllib.parse import urlparse import numpy as np import torch @@ -9,6 +14,7 @@ import torchvision.transforms._transforms_video as transforms from decord import VideoReader from torchvision.transforms import Compose +from modelscope.hub.file_download import http_get_file from modelscope.metainfo import Preprocessors from modelscope.utils.constant import Fields, ModeKeys from modelscope.utils.type_assert import type_assert @@ -30,7 +36,22 @@ def ReadVideoData(cfg, Returns: data (Tensor): the normalized video clips for model inputs """ - data = _decode_video(cfg, video_path, num_temporal_views_override) + url_parsed = urlparse(video_path) + if url_parsed.scheme in ('file', '') and exists( + url_parsed.path): # Possibly a local file + data = _decode_video(cfg, video_path, num_temporal_views_override) + else: + with TemporaryDirectory() as temporary_cache_dir: + random_str = uuid.uuid4().hex + http_get_file( + url=video_path, + local_dir=temporary_cache_dir, + file_name=random_str, + cookies=None) + temp_file_path = os.path.join(temporary_cache_dir, random_str) + data = _decode_video(cfg, temp_file_path, + num_temporal_views_override) + if num_spatial_crops_override is not None: num_spatial_crops = num_spatial_crops_override transform = kinetics400_tranform(cfg, num_spatial_crops_override) From 865397763ef51773f2285efde5df001f624f7bf3 Mon Sep 17 00:00:00 2001 From: "xianzhe.xxz" Date: Tue, 18 Oct 2022 16:53:29 +0800 Subject: [PATCH 066/129] [to #42322933]add damoyolo model in tinynas-object-detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 接入damyolo系列检测模型 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10377688 --- modelscope/metainfo.py | 2 + .../models/cv/tinynas_detection/__init__.py | 2 + .../cv/tinynas_detection/backbone/tinynas.py | 29 ++++++---- .../models/cv/tinynas_detection/detector.py | 5 +- .../tinynas_detection/head/gfocal_v2_tiny.py | 53 ++++++++++++------- .../tinynas_detection/neck/giraffe_fpn_v2.py | 5 +- .../cv/tinynas_detection/tinynas_damoyolo.py | 15 ++++++ .../cv/tinynas_detection/tinynas_detector.py | 2 +- .../cv/tinynas_detection_pipeline.py | 22 +++++--- modelscope/utils/file_utils.py | 8 +++ tests/pipelines/test_tinynas_detection.py | 29 ++++++++-- 11 files changed, 127 insertions(+), 45 deletions(-) create mode 100644 modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index fc18ead9..6c8d91fa 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -9,7 +9,9 @@ class Models(object): Model name should only contain model info but not task info. 
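    For example, the DAMO-YOLO detector added below registers its architecture as
    'tinynas-damoyolo'; the detection task itself is carried by the Tasks /
    Pipelines registries rather than by the model name.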
""" + # tinynas models tinynas_detection = 'tinynas-detection' + tinynas_damoyolo = 'tinynas-damoyolo' # vision models detection = 'detection' diff --git a/modelscope/models/cv/tinynas_detection/__init__.py b/modelscope/models/cv/tinynas_detection/__init__.py index 13532d10..6d696ac4 100644 --- a/modelscope/models/cv/tinynas_detection/__init__.py +++ b/modelscope/models/cv/tinynas_detection/__init__.py @@ -7,10 +7,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .tinynas_detector import Tinynas_detector + from .tinynas_damoyolo import DamoYolo else: _import_structure = { 'tinynas_detector': ['TinynasDetector'], + 'tinynas_damoyolo': ['DamoYolo'], } import sys diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py index 814ee550..87a28a2f 100755 --- a/modelscope/models/cv/tinynas_detection/backbone/tinynas.py +++ b/modelscope/models/cv/tinynas_detection/backbone/tinynas.py @@ -4,6 +4,7 @@ import torch import torch.nn as nn +from modelscope.utils.file_utils import read_file from ..core.base_ops import Focus, SPPBottleneck, get_activation from ..core.repvgg_block import RepVggBlock @@ -49,12 +50,16 @@ class ResConvK1KX(nn.Module): kernel_size, stride, force_resproj=False, - act='silu'): + act='silu', + reparam=False): super(ResConvK1KX, self).__init__() self.stride = stride self.conv1 = ConvKXBN(in_c, btn_c, 1, 1) - self.conv2 = RepVggBlock( - btn_c, out_c, kernel_size, stride, act='identity') + if not reparam: + self.conv2 = ConvKXBN(btn_c, out_c, 3, stride) + else: + self.conv2 = RepVggBlock( + btn_c, out_c, kernel_size, stride, act='identity') if act is None: self.activation_function = torch.relu @@ -97,7 +102,8 @@ class SuperResConvK1KX(nn.Module): stride, num_blocks, with_spp=False, - act='silu'): + act='silu', + reparam=False): super(SuperResConvK1KX, self).__init__() if act is None: self.act = torch.relu @@ -124,7 +130,8 @@ class SuperResConvK1KX(nn.Module): this_kernel_size, this_stride, force_resproj, - act=act) + act=act, + reparam=reparam) self.block_list.append(the_block) if block_id == 0 and with_spp: self.block_list.append( @@ -248,7 +255,8 @@ class TinyNAS(nn.Module): with_spp=False, use_focus=False, need_conv1=True, - act='silu'): + act='silu', + reparam=False): super(TinyNAS, self).__init__() assert len(out_indices) == len(out_channels) self.out_indices = out_indices @@ -281,7 +289,8 @@ class TinyNAS(nn.Module): block_info['s'], block_info['L'], spp, - act=act) + act=act, + reparam=reparam) self.block_list.append(the_block) elif the_block_class == 'SuperResConvKXKX': spp = with_spp if idx == len(structure_info) - 1 else False @@ -325,8 +334,8 @@ class TinyNAS(nn.Module): def load_tinynas_net(backbone_cfg): # load masternet model to path import ast - - struct_str = ''.join([x.strip() for x in backbone_cfg.net_structure_str]) + net_structure_str = read_file(backbone_cfg.structure_file) + struct_str = ''.join([x.strip() for x in net_structure_str]) struct_info = ast.literal_eval(struct_str) for layer in struct_info: if 'nbitsA' in layer: @@ -342,6 +351,6 @@ def load_tinynas_net(backbone_cfg): use_focus=backbone_cfg.use_focus, act=backbone_cfg.act, need_conv1=backbone_cfg.need_conv1, - ) + reparam=backbone_cfg.reparam) return model diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py index 615b13a8..42a71381 100644 --- a/modelscope/models/cv/tinynas_detection/detector.py +++ 
b/modelscope/models/cv/tinynas_detection/detector.py @@ -30,7 +30,7 @@ class SingleStageDetector(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - config_path = osp.join(model_dir, 'airdet_s.py') + config_path = osp.join(model_dir, self.config_name) config = parse_config(config_path) self.cfg = config model_path = osp.join(model_dir, config.model.name) @@ -41,6 +41,9 @@ class SingleStageDetector(TorchModel): self.conf_thre = config.model.head.nms_conf_thre self.nms_thre = config.model.head.nms_iou_thre + if self.cfg.model.backbone.name == 'TinyNAS': + self.cfg.model.backbone.structure_file = osp.join( + model_dir, self.cfg.model.backbone.structure_file) self.backbone = build_backbone(self.cfg.model.backbone) self.neck = build_neck(self.cfg.model.neck) self.head = build_head(self.cfg.model.head) diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py index 41f35968..66904ed1 100644 --- a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py +++ b/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py @@ -124,11 +124,13 @@ class GFocalHead_Tiny(nn.Module): simOTA_iou_weight=3.0, octbase=8, simlqe=False, + use_lqe=True, **kwargs): self.simlqe = simlqe self.num_classes = num_classes self.in_channels = in_channels self.strides = strides + self.use_lqe = use_lqe self.feat_channels = feat_channels if isinstance(feat_channels, list) \ else [feat_channels] * len(self.strides) @@ -181,15 +183,20 @@ class GFocalHead_Tiny(nn.Module): groups=self.conv_groups, norm=self.norm, act=self.act)) - if not self.simlqe: - conf_vector = [nn.Conv2d(4 * self.total_dim, self.reg_channels, 1)] + if self.use_lqe: + if not self.simlqe: + conf_vector = [ + nn.Conv2d(4 * self.total_dim, self.reg_channels, 1) + ] + else: + conf_vector = [ + nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1) + ] + conf_vector += [self.relu] + conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()] + reg_conf = nn.Sequential(*conf_vector) else: - conf_vector = [ - nn.Conv2d(4 * (self.reg_max + 1), self.reg_channels, 1) - ] - conf_vector += [self.relu] - conf_vector += [nn.Conv2d(self.reg_channels, 1, 1), nn.Sigmoid()] - reg_conf = nn.Sequential(*conf_vector) + reg_conf = None return cls_convs, reg_convs, reg_conf @@ -290,21 +297,27 @@ class GFocalHead_Tiny(nn.Module): N, C, H, W = bbox_pred.size() prob = F.softmax( bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) - if not self.simlqe: - prob_topk, _ = prob.topk(self.reg_topk, dim=2) - - if self.add_mean: - stat = torch.cat( - [prob_topk, prob_topk.mean(dim=2, keepdim=True)], dim=2) + if self.use_lqe: + if not self.simlqe: + prob_topk, _ = prob.topk(self.reg_topk, dim=2) + + if self.add_mean: + stat = torch.cat( + [prob_topk, + prob_topk.mean(dim=2, keepdim=True)], + dim=2) + else: + stat = prob_topk + + quality_score = reg_conf( + stat.reshape(N, 4 * self.total_dim, H, W)) else: - stat = prob_topk + quality_score = reg_conf( + bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W)) - quality_score = reg_conf(stat.reshape(N, 4 * self.total_dim, H, W)) + cls_score = gfl_cls(cls_feat).sigmoid() * quality_score else: - quality_score = reg_conf( - bbox_pred.reshape(N, 4 * (self.reg_max + 1), H, W)) - - cls_score = gfl_cls(cls_feat).sigmoid() * quality_score + cls_score = gfl_cls(cls_feat).sigmoid() flatten_cls_score = cls_score.flatten(start_dim=2).transpose(1, 2) flatten_bbox_pred = bbox_pred.flatten(start_dim=2).transpose(1, 2) diff --git 
a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py index b710572f..b88c39f2 100644 --- a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py +++ b/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_v2.py @@ -14,7 +14,6 @@ class GiraffeNeckV2(nn.Module): self, depth=1.0, width=1.0, - in_features=[2, 3, 4], in_channels=[256, 512, 1024], out_channels=[256, 512, 1024], depthwise=False, @@ -24,7 +23,6 @@ class GiraffeNeckV2(nn.Module): block_name='BasicBlock', ): super().__init__() - self.in_features = in_features self.in_channels = in_channels Conv = DWConv if depthwise else BaseConv @@ -169,8 +167,7 @@ class GiraffeNeckV2(nn.Module): """ # backbone - features = [out_features[f] for f in self.in_features] - [x2, x1, x0] = features + [x2, x1, x0] = out_features # node x3 x13 = self.bu_conv13(x1) diff --git a/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py b/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py new file mode 100644 index 00000000..9effad3a --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .detector import SingleStageDetector + + +@MODELS.register_module( + Tasks.image_object_detection, module_name=Models.tinynas_damoyolo) +class DamoYolo(SingleStageDetector): + + def __init__(self, model_dir, *args, **kwargs): + self.config_name = 'damoyolo_s.py' + super(DamoYolo, self).__init__(model_dir, *args, **kwargs) diff --git a/modelscope/models/cv/tinynas_detection/tinynas_detector.py b/modelscope/models/cv/tinynas_detection/tinynas_detector.py index e6f144df..92acf3fa 100644 --- a/modelscope/models/cv/tinynas_detection/tinynas_detector.py +++ b/modelscope/models/cv/tinynas_detection/tinynas_detector.py @@ -12,5 +12,5 @@ from .detector import SingleStageDetector class TinynasDetector(SingleStageDetector): def __init__(self, model_dir, *args, **kwargs): - + self.config_name = 'airdet_s.py' super(TinynasDetector, self).__init__(model_dir, *args, **kwargs) diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py index b2063629..d35d4d36 100644 --- a/modelscope/pipelines/cv/tinynas_detection_pipeline.py +++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import \ + show_image_object_detection_auto_result from modelscope.utils.logger import get_logger logger = get_logger() @@ -52,10 +54,18 @@ class TinynasDetectionPipeline(Pipeline): bboxes, scores, labels = self.model.postprocess(inputs['data']) if bboxes is None: - return None - outputs = { - OutputKeys.SCORES: scores, - OutputKeys.LABELS: labels, - OutputKeys.BOXES: bboxes - } + outputs = { + OutputKeys.SCORES: [], + OutputKeys.LABELS: [], + OutputKeys.BOXES: [] + } + else: + outputs = { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: bboxes + } return outputs + + def show_result(self, img_path, result, save_path=None): + show_image_object_detection_auto_result(img_path, result, save_path) diff --git a/modelscope/utils/file_utils.py 
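A short usage sketch of the pipeline behaviour introduced above, reusing the model id and image path from the tests in this patch; treat it as illustrative rather than canonical. Because postprocess now returns empty lists instead of None, the result can be indexed unconditionally:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

detector = pipeline(
    Tasks.image_object_detection, model='damo/cv_tinynas_detection')
img = 'data/test/images/image_detection.jpg'
result = detector(img)

# draw boxes only when something was detected; show_result is the helper
# added to TinynasDetectionPipeline above
if len(result[OutputKeys.BOXES]) > 0:
    detector.show_result(img, result, 'demo_ret.jpg')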
b/modelscope/utils/file_utils.py index 9b82f8d2..cf59dc57 100644 --- a/modelscope/utils/file_utils.py +++ b/modelscope/utils/file_utils.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import inspect +import os from pathlib import Path @@ -35,3 +36,10 @@ def get_default_cache_dir(): """ default_cache_dir = Path.home().joinpath('.cache', 'modelscope') return default_cache_dir + + +def read_file(path): + + with open(path, 'r') as f: + text = f.read() + return text diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py index 63db9145..43e1842d 100644 --- a/tests/pipelines/test_tinynas_detection.py +++ b/tests/pipelines/test_tinynas_detection.py @@ -4,22 +4,45 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level -class TinynasObjectDetectionTest(unittest.TestCase): +class TinynasObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_object_detection + self.model_id = 'damo/cv_tinynas_object-detection_damoyolo' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run(self): + def test_run_airdet(self): tinynas_object_detection = pipeline( Tasks.image_object_detection, model='damo/cv_tinynas_detection') result = tinynas_object_detection( 'data/test/images/image_detection.jpg') print(result) + @unittest.skip('will be enabled after damoyolo officially released') + def test_run_damoyolo(self): + tinynas_object_detection = pipeline( + Tasks.image_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo') + result = tinynas_object_detection( + 'data/test/images/image_detection.jpg') + print(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): - self.test_demo() + self.compatibility_check() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_object_detection_auto_pipeline(self): + test_image = 'data/test/images/image_detection.jpg' + tinynas_object_detection = pipeline( + Tasks.image_object_detection, model='damo/cv_tinynas_detection') + result = tinynas_object_detection(test_image) + tinynas_object_detection.show_result(test_image, result, + 'demo_ret.jpg') if __name__ == '__main__': From 5f937b20cf5a7eeed034af5fccd11110b780c08e Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 18 Oct 2022 17:47:23 +0800 Subject: [PATCH 067/129] [to #45549080]fix: fix pre_commit dependency importlib-metadata 5.0.0 compability issue Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10443005 --- .pre-commit-config.yaml | 2 +- .pre-commit-config_local.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c6290ff4..48fe7547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://gitlab.com/pycqa/flake8.git - rev: 3.8.3 + rev: 4.0.0 hooks: - id: flake8 exclude: thirdparty/|examples/ diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml index 138561e3..0b2e2f39 100644 --- a/.pre-commit-config_local.yaml +++ b/.pre-commit-config_local.yaml @@ -1,6 +1,6 @@ repos: - repo: /home/admin/pre-commit/flake8 - rev: 3.8.3 + rev: 4.0.0 hooks: - id: flake8 exclude: thirdparty/|examples/ From 
9809d960b0d60af546ba16e9f9c9f64e8232c1e7 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Wed, 19 Oct 2022 10:09:06 +0800 Subject: [PATCH 068/129] [to #42322933] Fix issue -- it's not necessary to login for loading public datasets. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复”公开数据集需要登录才能load“的问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10448936 --- modelscope/hub/api.py | 8 +++++--- modelscope/msdatasets/utils/dataset_utils.py | 8 +++----- modelscope/utils/constant.py | 7 ------- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index dc4d0ab2..2aab142e 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -390,11 +390,13 @@ class HubApi: return resp['Data'] def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, - is_recursive, is_filter_dir, revision, - cookies): + is_recursive, is_filter_dir, revision): url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \ f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' - cookies = requests.utils.dict_from_cookiejar(cookies) + + cookies = ModelScopeConfig.get_cookies() + if cookies: + cookies = requests.utils.dict_from_cookiejar(cookies) resp = requests.get(url=url, cookies=cookies) resp = resp.json() diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index db9d1fee..c7aa7682 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -7,7 +7,7 @@ from typing import Any, Mapping, Optional, Sequence, Union from datasets.builder import DatasetBuilder from modelscope.hub.api import HubApi -from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams +from modelscope.utils.constant import DEFAULT_DATASET_REVISION from modelscope.utils.logger import get_logger from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder @@ -95,15 +95,13 @@ def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool, res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...] """ res = [] - cookies = hub_api.check_cookies_upload_data(use_cookies=True) objects = hub_api.list_oss_dataset_objects( dataset_name=dataset_name, namespace=namespace, max_limit=max_limit, is_recursive=is_recursive, is_filter_dir=True, - revision=version, - cookies=cookies) + revision=version) for item in objects: object_key = item.get('Key') @@ -174,7 +172,7 @@ def get_dataset_files(subset_split_into: dict, modelscope_api = HubApi() objects = list_dataset_objects( hub_api=modelscope_api, - max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value, + max_limit=-1, is_recursive=True, dataset_name=dataset_name, namespace=namespace, diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 6ba58c19..6c0f3e98 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -231,13 +231,6 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' -class DownloadParams(enum.Enum): - """ - Parameters for downloading dataset. 
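The hub change above makes list_oss_dataset_objects fetch cached cookies itself and tolerate their absence, so loading a public dataset no longer requires a prior login. A hedged sketch of the resulting call pattern, with a placeholder dataset name:

from modelscope.msdatasets import MsDataset

# no HubApi().login(...) call is needed for a public dataset; cookies are only
# attached when credentials are already cached locally
ds = MsDataset.load(
    dataset_name='some_public_dataset',  # placeholder name
    namespace='modelscope',
    split='train')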
- """ - MAX_LIST_OBJECTS_NUM = 50000 - - class DatasetFormations(enum.Enum): """ How a dataset is organized and interpreted """ From 63ac21147d93fb8a4c968c22a80f1b51e271dc2c Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Wed, 19 Oct 2022 13:24:23 +0800 Subject: [PATCH 069/129] [to #42322933] fix some logs --- modelscope/utils/device.py | 11 +++++------ modelscope/utils/registry.py | 9 ++++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py index 33c0910d..6fc59e37 100644 --- a/modelscope/utils/device.py +++ b/modelscope/utils/device.py @@ -61,8 +61,8 @@ def device_placement(framework, device_name='gpu:0'): if framework == Frameworks.tf: import tensorflow as tf if device_type == Devices.gpu and not tf.test.is_gpu_available(): - logger.warning( - 'tensorflow cuda is not available, using cpu instead.') + logger.debug( + 'tensorflow: cuda is not available, using cpu instead.') device_type = Devices.cpu if device_type == Devices.cpu: with tf.device('/CPU:0'): @@ -78,7 +78,8 @@ def device_placement(framework, device_name='gpu:0'): if torch.cuda.is_available(): torch.cuda.set_device(f'cuda:{device_id}') else: - logger.warning('cuda is not available, using cpu instead.') + logger.debug( + 'pytorch: cuda is not available, using cpu instead.') yield else: yield @@ -96,9 +97,7 @@ def create_device(device_name): if device_type == Devices.gpu: use_cuda = True if not torch.cuda.is_available(): - logger.warning( - 'cuda is not available, create gpu device failed, using cpu instead.' - ) + logger.info('cuda is not available, using cpu instead.') use_cuda = False if use_cuda: diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 7a9c79e2..73e94b3c 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -176,7 +176,7 @@ def build_from_cfg(cfg, raise TypeError('default_args must be a dict or None, ' f'but got {type(default_args)}') - # dynamic load installation reqruiements for this module + # dynamic load installation requirements for this module from modelscope.utils.import_utils import LazyImportModule sig = (registry.name.upper(), group_key, cfg['type']) LazyImportModule.import_module(sig) @@ -193,8 +193,11 @@ def build_from_cfg(cfg, if isinstance(obj_type, str): obj_cls = registry.get(obj_type, group_key=group_key) if obj_cls is None: - raise KeyError(f'{obj_type} is not in the {registry.name}' f' registry group {group_key}') + raise KeyError( + f'{obj_type} is not in the {registry.name}' + f' registry group {group_key}. Please make' + f' sure the correct version of ModelScope library is used.'
+ ) obj_cls.group_key = group_key elif inspect.isclass(obj_type) or inspect.isfunction(obj_type): obj_cls = obj_type From 87b4a52b3c22446a2265cdf90500b6655497bfd7 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Wed, 19 Oct 2022 23:29:58 +0800 Subject: [PATCH 070/129] Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10462681 --- modelscope/hub/git.py | 7 +++++-- tests/hub/test_hub_upload.py | 39 +++++++++--------------------------- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index db76506e..1369fddf 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -184,8 +184,11 @@ class GitCommandWrapper(metaclass=Singleton): info = [ line.strip() for line in rsp.stdout.decode('utf8').strip().split(os.linesep) - ][1:] - return ['/'.join(line.split('/')[1:]) for line in info] + ] + if len(info) == 1: + return ['/'.join(info[0].split('/')[1:])] + else: + return ['/'.join(line.split('/')[1:]) for line in info[1:]] def pull(self, repo_dir: str): cmds = ['-C', repo_dir, 'pull'] diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py index 2250164b..a10f7c81 100644 --- a/tests/hub/test_hub_upload.py +++ b/tests/hub/test_hub_upload.py @@ -7,12 +7,13 @@ import uuid from modelscope.hub.api import HubApi from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.errors import HTTPError, NotLoginException from modelscope.hub.repository import Repository from modelscope.hub.upload import upload_folder from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level -from .test_utils import TEST_ACCESS_TOKEN1, delete_credential +from .test_utils import TEST_ACCESS_TOKEN1, TEST_MODEL_ORG, delete_credential logger = get_logger() @@ -22,7 +23,7 @@ class HubUploadTest(unittest.TestCase): def setUp(self): logger.info('SetUp') self.api = HubApi() - self.user = os.environ.get('TEST_MODEL_ORG', 'citest') + self.user = TEST_MODEL_ORG logger.info(self.user) self.create_model_name = '%s/%s_%s' % (self.user, 'test_model_upload', uuid.uuid4().hex) @@ -39,7 +40,10 @@ class HubUploadTest(unittest.TestCase): def tearDown(self): logger.info('TearDown') shutil.rmtree(self.model_dir, ignore_errors=True) - self.api.delete_model(model_id=self.create_model_name) + try: + self.api.delete_model(model_id=self.create_model_name) + except Exception: + pass def test_upload_exits_repo_master(self): logger.info('basic test for upload!') @@ -119,48 +123,23 @@ class HubUploadTest(unittest.TestCase): logger.info('test upload without login!') self.api.login(TEST_ACCESS_TOKEN1) delete_credential() - try: - upload_folder( - model_id=self.create_model_name, - model_dir=self.finetune_path, - visibility=ModelVisibility.PUBLIC, - license=Licenses.APACHE_V2) - except Exception as e: - logger.info(e) - self.api.login(TEST_ACCESS_TOKEN1) + with self.assertRaises(NotLoginException): upload_folder( model_id=self.create_model_name, model_dir=self.finetune_path, visibility=ModelVisibility.PUBLIC, license=Licenses.APACHE_V2) - Repository( - model_dir=self.repo_path, clone_from=self.create_model_name) - assert os.path.exists( - os.path.join(self.repo_path, 'configuration.json')) - shutil.rmtree(self.repo_path, ignore_errors=True) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_upload_invalid_repo(self): logger.info('test upload to invalid repo!') self.api.login(TEST_ACCESS_TOKEN1) - try: + with 
self.assertRaises(HTTPError): upload_folder( model_id='%s/%s' % ('speech_tts', 'invalid_model_test'), model_dir=self.finetune_path, visibility=ModelVisibility.PUBLIC, license=Licenses.APACHE_V2) - except Exception as e: - logger.info(e) - upload_folder( - model_id=self.create_model_name, - model_dir=self.finetune_path, - visibility=ModelVisibility.PUBLIC, - license=Licenses.APACHE_V2) - Repository( - model_dir=self.repo_path, clone_from=self.create_model_name) - assert os.path.exists( - os.path.join(self.repo_path, 'configuration.json')) - shutil.rmtree(self.repo_path, ignore_errors=True) if __name__ == '__main__': From 089cadab4b0f0e6d17a76ef0ef88ae0c1c9d4fe5 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Thu, 20 Oct 2022 08:51:25 +0800 Subject: [PATCH 071/129] [to #42322933] disable unstable trainer test --- tests/trainers/test_image_denoise_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 68ddf616..c4abca6a 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -51,7 +51,7 @@ class ImageDenoiseTrainerTest(unittest.TestCase): shutil.rmtree(self.tmp_dir, ignore_errors=True) super().tearDown() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer(self): kwargs = dict( model=self.model_id, @@ -65,7 +65,7 @@ class ImageDenoiseTrainerTest(unittest.TestCase): for i in range(2): self.assertIn(f'epoch_{i+1}.pth', results_files) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_model_and_args(self): model = NAFNetForImageDenoise.from_pretrained(self.cache_path) kwargs = dict( From 3cd5f73da076cc550b747473199585a80bc90917 Mon Sep 17 00:00:00 2001 From: "yingda.chen" Date: Thu, 20 Oct 2022 10:28:15 +0800 Subject: [PATCH 072/129] [to #42322933] refactor push_model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10463992 --- modelscope/hub/api.py | 128 +++++++++++++++++++++++++--- modelscope/hub/file_download.py | 3 +- modelscope/hub/git.py | 4 +- modelscope/hub/repository.py | 4 +- modelscope/hub/snapshot_download.py | 2 +- modelscope/hub/upload.py | 117 ------------------------- tests/hub/test_hub_operation.py | 2 +- tests/hub/test_hub_upload.py | 15 ++-- 8 files changed, 130 insertions(+), 145 deletions(-) delete mode 100644 modelscope/hub/upload.py diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 2aab142e..f8ca683a 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -1,8 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+# yapf: disable +import datetime import os import pickle import shutil +import tempfile from collections import defaultdict from http import HTTPStatus from http.cookiejar import CookieJar @@ -16,17 +19,25 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, - DEFAULT_CREDENTIALS_PATH) + DEFAULT_CREDENTIALS_PATH, Licenses, + ModelVisibility) +from modelscope.hub.errors import (InvalidParameter, NotExistError, + NotLoginException, RequestError, + datahub_raise_on_error, + handle_http_post_error, + handle_http_response, is_ok, raise_on_error) +from modelscope.hub.git import GitCommandWrapper +from modelscope.hub.repository import Repository +from modelscope.hub.utils.utils import (get_endpoint, + model_id_to_group_owner_name) from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DatasetFormations, DatasetMetaFormats, - DownloadMode) + DownloadMode, ModelFile) from modelscope.utils.logger import get_logger -from .errors import (InvalidParameter, NotExistError, RequestError, - datahub_raise_on_error, handle_http_post_error, - handle_http_response, is_ok, raise_on_error) -from .utils.utils import get_endpoint, model_id_to_group_owner_name + +# yapf: enable logger = get_logger() @@ -169,11 +180,106 @@ class HubApi: else: r.raise_for_status() - def list_model(self, - owner_or_group: str, - page_number=1, - page_size=10) -> dict: - """List model in owner or group. + def push_model(self, + model_id: str, + model_dir: str, + visibility: int = ModelVisibility.PUBLIC, + license: str = Licenses.APACHE_V2, + chinese_name: Optional[str] = None, + commit_message: Optional[str] = 'upload model', + revision: Optional[str] = DEFAULT_MODEL_REVISION): + """ + Upload model from a given directory to given repository. A valid model directory + must contain a configuration.json file. + + This function upload the files in given directory to given repository. If the + given repository is not exists in remote, it will automatically create it with + given visibility, license and chinese_name parameters. If the revision is also + not exists in remote repository, it will create a new branch for it. + + This function must be called before calling HubApi's login with a valid token + which can be obtained from ModelScope's website. + + Args: + model_id (`str`): + The model id to be uploaded, caller must have write permission for it. + model_dir(`str`): + The Absolute Path of the finetune result. + visibility(`int`, defaults to `0`): + Visibility of the new created model(1-private, 5-public). If the model is + not exists in ModelScope, this function will create a new model with this + visibility and this parameter is required. You can ignore this parameter + if you make sure the model's existence. + license(`str`, defaults to `None`): + License of the new created model(see License). If the model is not exists + in ModelScope, this function will create a new model with this license + and this parameter is required. You can ignore this parameter if you + make sure the model's existence. + chinese_name(`str`, *optional*, defaults to `None`): + chinese name of the new created model. + commit_message(`str`, *optional*, defaults to `None`): + commit message of the push request. + revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): + which branch to push. 
If the branch is not exists, It will create a new + branch and push to it. + """ + if model_id is None: + raise InvalidParameter('model_id cannot be empty!') + if model_dir is None: + raise InvalidParameter('model_dir cannot be empty!') + if not os.path.exists(model_dir) or os.path.isfile(model_dir): + raise InvalidParameter('model_dir must be a valid directory.') + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if not os.path.exists(cfg_file): + raise ValueError(f'{model_dir} must contain a configuration.json.') + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException('Must login before upload!') + files_to_save = os.listdir(model_dir) + try: + self.get_model(model_id=model_id) + except Exception: + if visibility is None or license is None: + raise InvalidParameter( + 'visibility and license cannot be empty if want to create new repo' + ) + logger.info('Create new model %s' % model_id) + self.create_model( + model_id=model_id, + visibility=visibility, + license=license, + chinese_name=chinese_name) + tmp_dir = tempfile.mkdtemp() + git_wrapper = GitCommandWrapper() + try: + repo = Repository(model_dir=tmp_dir, clone_from=model_id) + branches = git_wrapper.get_remote_branches(tmp_dir) + if revision not in branches: + logger.info('Create new branch %s' % revision) + git_wrapper.new_branch(tmp_dir, revision) + git_wrapper.checkout(tmp_dir, revision) + for f in files_to_save: + if f[0] != '.': + src = os.path.join(model_dir, f) + if os.path.isdir(src): + shutil.copytree(src, os.path.join(tmp_dir, f)) + else: + shutil.copy(src, tmp_dir) + if not commit_message: + date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + commit_message = '[automsg] push model %s to hub at %s' % ( + model_id, date) + repo.push(commit_message=commit_message, branch=revision) + except Exception: + raise + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) + + def list_models(self, + owner_or_group: str, + page_number=1, + page_size=10) -> dict: + """List models in owner or group. Args: owner_or_group(`str`): owner or group. diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 1cc5645b..8ffc60bc 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -11,13 +11,12 @@ from typing import Dict, Optional, Union from uuid import uuid4 import requests -from filelock import FileLock from tqdm import tqdm from modelscope import __version__ +from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.utils.constant import DEFAULT_MODEL_REVISION from modelscope.utils.logger import get_logger -from .api import HubApi, ModelScopeConfig from .constants import FILE_HASH from .errors import FileDownloadError, NotExistError from .utils.caching import ModelFileSystemCache diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 1369fddf..fe1d1554 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -1,13 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
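A usage sketch of the push_model API documented above; the token, model id and directory are placeholders, a prior login is required, and the directory must contain a configuration.json:

from modelscope.hub.api import HubApi
from modelscope.hub.constants import Licenses, ModelVisibility

api = HubApi()
api.login('YOUR_SDK_TOKEN')  # placeholder token from the ModelScope website

api.push_model(
    model_id='my_org/my_finetuned_model',   # placeholder, caller needs write access
    model_dir='/path/to/finetune_output',   # must contain configuration.json
    visibility=ModelVisibility.PUBLIC,
    license=Licenses.APACHE_V2,
    commit_message='upload finetuned weights')

# the renamed helper lists models under an owner or group
print(api.list_models('my_org'))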
import os -import re import subprocess from typing import List -from xmlrpc.client import Boolean from modelscope.utils.logger import get_logger -from .api import ModelScopeConfig from .errors import GitError logger = get_logger() @@ -132,6 +129,7 @@ class GitCommandWrapper(metaclass=Singleton): return response def add_user_info(self, repo_base_dir, repo_name): + from modelscope.hub.api import ModelScopeConfig user_name, user_email = ModelScopeConfig.get_user_info() if user_name and user_email: # config user.name and user.email if exist diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index d92089ed..35c831a9 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -7,7 +7,6 @@ from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION) from modelscope.utils.logger import get_logger -from .api import ModelScopeConfig from .git import GitCommandWrapper from .utils.utils import get_endpoint @@ -47,6 +46,7 @@ class Repository: err_msg = 'a non-default value of revision cannot be empty.' raise InvalidParameter(err_msg) + from modelscope.hub.api import ModelScopeConfig if auth_token: self.auth_token = auth_token else: @@ -166,7 +166,7 @@ class DatasetRepository: err_msg = 'a non-default value of revision cannot be empty.' raise InvalidParameter(err_msg) self.revision = revision - + from modelscope.hub.api import ModelScopeConfig if auth_token: self.auth_token = auth_token else: diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index cde6ad34..ac57d1b1 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -5,9 +5,9 @@ import tempfile from pathlib import Path from typing import Dict, Optional, Union +from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.utils.constant import DEFAULT_MODEL_REVISION from modelscope.utils.logger import get_logger -from .api import HubApi, ModelScopeConfig from .constants import FILE_HASH from .errors import NotExistError from .file_download import (get_file_download_url, http_get_file, diff --git a/modelscope/hub/upload.py b/modelscope/hub/upload.py deleted file mode 100644 index 9dffc60e..00000000 --- a/modelscope/hub/upload.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import datetime -import os -import shutil -import tempfile -import uuid -from typing import Dict, Optional -from uuid import uuid4 - -from filelock import FileLock - -from modelscope import __version__ -from modelscope.hub.api import HubApi, ModelScopeConfig -from modelscope.hub.errors import InvalidParameter, NotLoginException -from modelscope.hub.git import GitCommandWrapper -from modelscope.hub.repository import Repository -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile -from modelscope.utils.logger import get_logger - -logger = get_logger() - - -def upload_folder(model_id: str, - model_dir: str, - visibility: int = 0, - license: str = None, - chinese_name: Optional[str] = None, - commit_message: Optional[str] = None, - revision: Optional[str] = DEFAULT_MODEL_REVISION): - """ - Upload model from a given directory to given repository. A valid model directory - must contain a configuration.json file. - - This function upload the files in given directory to given repository. 
If the - given repository is not exists in remote, it will automatically create it with - given visibility, license and chinese_name parameters. If the revision is also - not exists in remote repository, it will create a new branch for it. - - This function must be called before calling HubApi's login with a valid token - which can be obtained from ModelScope's website. - - Args: - model_id (`str`): - The model id to be uploaded, caller must have write permission for it. - model_dir(`str`): - The Absolute Path of the finetune result. - visibility(`int`, defaults to `0`): - Visibility of the new created model(1-private, 5-public). If the model is - not exists in ModelScope, this function will create a new model with this - visibility and this parameter is required. You can ignore this parameter - if you make sure the model's existence. - license(`str`, defaults to `None`): - License of the new created model(see License). If the model is not exists - in ModelScope, this function will create a new model with this license - and this parameter is required. You can ignore this parameter if you - make sure the model's existence. - chinese_name(`str`, *optional*, defaults to `None`): - chinese name of the new created model. - commit_message(`str`, *optional*, defaults to `None`): - commit message of the push request. - revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): - which branch to push. If the branch is not exists, It will create a new - branch and push to it. - """ - if model_id is None: - raise InvalidParameter('model_id cannot be empty!') - if model_dir is None: - raise InvalidParameter('model_dir cannot be empty!') - if not os.path.exists(model_dir) or os.path.isfile(model_dir): - raise InvalidParameter('model_dir must be a valid directory.') - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) - if not os.path.exists(cfg_file): - raise ValueError(f'{model_dir} must contain a configuration.json.') - cookies = ModelScopeConfig.get_cookies() - if cookies is None: - raise NotLoginException('Must login before upload!') - files_to_save = os.listdir(model_dir) - api = HubApi() - try: - api.get_model(model_id=model_id) - except Exception: - if visibility is None or license is None: - raise InvalidParameter( - 'visibility and license cannot be empty if want to create new repo' - ) - logger.info('Create new model %s' % model_id) - api.create_model( - model_id=model_id, - visibility=visibility, - license=license, - chinese_name=chinese_name) - tmp_dir = tempfile.mkdtemp() - git_wrapper = GitCommandWrapper() - try: - repo = Repository(model_dir=tmp_dir, clone_from=model_id) - branches = git_wrapper.get_remote_branches(tmp_dir) - if revision not in branches: - logger.info('Create new branch %s' % revision) - git_wrapper.new_branch(tmp_dir, revision) - git_wrapper.checkout(tmp_dir, revision) - for f in files_to_save: - if f[0] != '.': - src = os.path.join(model_dir, f) - if os.path.isdir(src): - shutil.copytree(src, os.path.join(tmp_dir, f)) - else: - shutil.copy(src, tmp_dir) - if not commit_message: - date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - commit_message = '[automsg] push model %s to hub at %s' % ( - model_id, date) - repo.push(commit_message=commit_message, branch=revision) - except Exception: - raise - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py index c96db986..f2bdb2d3 100644 --- a/tests/hub/test_hub_operation.py +++ b/tests/hub/test_hub_operation.py @@ -127,7 
+127,7 @@ class HubOperationTest(unittest.TestCase): return None def test_list_model(self): - data = self.api.list_model(TEST_MODEL_ORG) + data = self.api.list_models(TEST_MODEL_ORG) assert len(data['Models']) >= 1 diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py index a10f7c81..e1f61467 100644 --- a/tests/hub/test_hub_upload.py +++ b/tests/hub/test_hub_upload.py @@ -9,7 +9,6 @@ from modelscope.hub.api import HubApi from modelscope.hub.constants import Licenses, ModelVisibility from modelscope.hub.errors import HTTPError, NotLoginException from modelscope.hub.repository import Repository -from modelscope.hub.upload import upload_folder from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level @@ -54,14 +53,14 @@ class HubUploadTest(unittest.TestCase): license=Licenses.APACHE_V2) os.system("echo '111'>%s" % os.path.join(self.finetune_path, 'add1.py')) - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path) Repository(model_dir=self.repo_path, clone_from=self.create_model_name) assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) shutil.rmtree(self.repo_path, ignore_errors=True) os.system("echo '222'>%s" % os.path.join(self.finetune_path, 'add2.py')) - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path, revision='new_revision/version1') @@ -73,7 +72,7 @@ class HubUploadTest(unittest.TestCase): shutil.rmtree(self.repo_path, ignore_errors=True) os.system("echo '333'>%s" % os.path.join(self.finetune_path, 'add3.py')) - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path, revision='new_revision/version2', @@ -88,7 +87,7 @@ class HubUploadTest(unittest.TestCase): add4_path = os.path.join(self.finetune_path, 'temp') os.mkdir(add4_path) os.system("echo '444'>%s" % os.path.join(add4_path, 'add4.py')) - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path, revision='new_revision/version1') @@ -105,7 +104,7 @@ class HubUploadTest(unittest.TestCase): self.api.login(TEST_ACCESS_TOKEN1) os.system("echo '111'>%s" % os.path.join(self.finetune_path, 'add1.py')) - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path, revision='new_model_new_revision', @@ -124,7 +123,7 @@ class HubUploadTest(unittest.TestCase): self.api.login(TEST_ACCESS_TOKEN1) delete_credential() with self.assertRaises(NotLoginException): - upload_folder( + self.api.push_model( model_id=self.create_model_name, model_dir=self.finetune_path, visibility=ModelVisibility.PUBLIC, @@ -135,7 +134,7 @@ class HubUploadTest(unittest.TestCase): logger.info('test upload to invalid repo!') self.api.login(TEST_ACCESS_TOKEN1) with self.assertRaises(HTTPError): - upload_folder( + self.api.push_model( model_id='%s/%s' % ('speech_tts', 'invalid_model_test'), model_dir=self.finetune_path, visibility=ModelVisibility.PUBLIC, From eb7bd5b8256d474e4382ee73f19860bed3d21689 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Thu, 20 Oct 2022 11:46:50 +0800 Subject: [PATCH 073/129] [to #42322933]Add lock to tts's AM and model loaded since its AR part not lockfree Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10464620 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10464620 --- modelscope/models/audio/tts/voice.py | 78 +++++++++++++++------------- 1 file changed, 41 insertions(+), 37 
deletions(-) diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py index dc830db5..b7240088 100644 --- a/modelscope/models/audio/tts/voice.py +++ b/modelscope/models/audio/tts/voice.py @@ -2,6 +2,7 @@ import os import pickle as pkl +from threading import Lock import json import numpy as np @@ -27,6 +28,7 @@ class Voice: self.__am_config = AttrDict(**am_config) self.__voc_config = AttrDict(**voc_config) self.__model_loaded = False + self.__lock = Lock() if 'am' not in self.__am_config: raise TtsModelConfigurationException( 'modelscope error: am configuration invalid') @@ -71,34 +73,35 @@ class Voice: self.__generator.remove_weight_norm() def __am_forward(self, symbol_seq): - with torch.no_grad(): - inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( - symbol_seq) - inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( - self.__device) - inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( - self.__device) - inputs_syllable = torch.from_numpy(inputs_feat_lst[2]).long().to( - self.__device) - inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to( - self.__device) - inputs_ling = torch.stack( - [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], - dim=-1).unsqueeze(0) - inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( - self.__device).unsqueeze(0) - inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( - self.__device).unsqueeze(0) - inputs_len = torch.zeros(1).to(self.__device).long( - ) + inputs_emo.size(1) - 1 # minus 1 for "~" - res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1], - inputs_spk[:, :-1], inputs_len) - postnet_outputs = res['postnet_outputs'] - LR_length_rounded = res['LR_length_rounded'] - valid_length = int(LR_length_rounded[0].item()) - postnet_outputs = postnet_outputs[ - 0, :valid_length, :].cpu().numpy() - return postnet_outputs + with self.__lock: + with torch.no_grad(): + inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( + symbol_seq) + inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( + self.__device) + inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( + self.__device) + inputs_syllable = torch.from_numpy( + inputs_feat_lst[2]).long().to(self.__device) + inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to( + self.__device) + inputs_ling = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], + dim=-1).unsqueeze(0) + inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( + self.__device).unsqueeze(0) + inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( + self.__device).unsqueeze(0) + inputs_len = torch.zeros(1).to(self.__device).long( + ) + inputs_emo.size(1) - 1 # minus 1 for "~" + res = self.__am_net(inputs_ling[:, :-1, :], inputs_emo[:, :-1], + inputs_spk[:, :-1], inputs_len) + postnet_outputs = res['postnet_outputs'] + LR_length_rounded = res['LR_length_rounded'] + valid_length = int(LR_length_rounded[0].item()) + postnet_outputs = postnet_outputs[ + 0, :valid_length, :].cpu().numpy() + return postnet_outputs def __vocoder_forward(self, melspec): dim0 = list(melspec.shape)[-1] @@ -118,14 +121,15 @@ class Voice: return audio def forward(self, symbol_seq): - if not self.__model_loaded: - torch.manual_seed(self.__am_config.seed) - if torch.cuda.is_available(): + with self.__lock: + if not self.__model_loaded: torch.manual_seed(self.__am_config.seed) - self.__device = torch.device('cuda') - else: - self.__device = torch.device('cpu') - self.__load_am() - self.__load_vocoder() - self.__model_loaded = True + 
if torch.cuda.is_available(): + torch.manual_seed(self.__am_config.seed) + self.__device = torch.device('cuda') + else: + self.__device = torch.device('cpu') + self.__load_am() + self.__load_vocoder() + self.__model_loaded = True return self.__vocoder_forward(self.__am_forward(symbol_seq)) From 01d521dd7851ee273de3793d7ad123ad99c0de9c Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Thu, 20 Oct 2022 11:51:52 +0800 Subject: [PATCH 074/129] [to #42322933]add face 2d keypoints finetune test case Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10421808 * add face 2d keypoints & human wholebody keypoint finrtune test case --- modelscope/metainfo.py | 4 +- modelscope/msdatasets/cv/easycv_base.py | 13 ++-- requirements/cv.txt | 2 +- .../test_easycv_trainer_face_2d_keypoints.py | 71 +++++++++++++++++++ 4 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 6c8d91fa..31bef3b8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -452,9 +452,9 @@ class Datasets(object): """ Names for different datasets. """ ClsDataset = 'ClsDataset' - Face2dKeypointsDataset = 'Face2dKeypointsDataset' + Face2dKeypointsDataset = 'FaceKeypointDataset' HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset' - HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' + HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/msdatasets/cv/easycv_base.py b/modelscope/msdatasets/cv/easycv_base.py index a45827a3..7b6df6e0 100644 --- a/modelscope/msdatasets/cv/easycv_base.py +++ b/modelscope/msdatasets/cv/easycv_base.py @@ -26,11 +26,16 @@ class EasyCVBaseDataset(object): if self.split_config is not None: self._update_data_source(kwargs['data_source']) + def _update_data_root(self, input_dict, data_root): + for k, v in input_dict.items(): + if isinstance(v, str) and self.DATA_ROOT_PATTERN in v: + input_dict.update( + {k: v.replace(self.DATA_ROOT_PATTERN, data_root)}) + elif isinstance(v, dict): + self._update_data_root(v, data_root) + def _update_data_source(self, data_source): data_root = next(iter(self.split_config.values())) data_root = data_root.rstrip(osp.sep) - for k, v in data_source.items(): - if isinstance(v, str) and self.DATA_ROOT_PATTERN in v: - data_source.update( - {k: v.replace(self.DATA_ROOT_PATTERN, data_root)}) + self._update_data_root(data_source, data_root) diff --git a/requirements/cv.txt b/requirements/cv.txt index d23fab3a..f29b296b 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -19,7 +19,7 @@ moviepy>=1.0.3 networkx>=2.5 numba onnxruntime>=1.10 -pai-easycv>=0.6.3.7 +pai-easycv>=0.6.3.9 pandas psutil regex diff --git a/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py new file mode 100644 index 00000000..4dffa998 --- /dev/null +++ b/tests/trainers/easycv/test_easycv_trainer_face_2d_keypoints.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
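The voice.py change in the TTS patch above serializes both lazy model loading and the acoustic-model forward behind a threading.Lock, because the autoregressive part is not lock-free. A generic sketch of that pattern; the class and method names here are illustrative, not the actual Voice implementation:

import threading


class LazySynthesizer:

    def __init__(self):
        self._lock = threading.Lock()
        self._loaded = False

    def infer(self, inputs):
        with self._lock:
            if not self._loaded:
                self._load_models()  # load weights once, under the lock
                self._loaded = True
            # the forward pass stays inside the lock: the AR decoder is not thread-safe
            return self._forward(inputs)

    def _load_models(self):
        pass

    def _forward(self, inputs):
        return inputs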
+import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode, LogKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestFace2DKeypoints(unittest.TestCase): + model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment' + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + + def _train(self, tmp_dir): + cfg_options = {'train.max_epochs': 2} + + trainer_name = Trainers.easycv + + train_dataset = MsDataset.load( + dataset_name='face_2d_keypoints_dataset', + namespace='modelscope', + split='train', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + eval_dataset = MsDataset.load( + dataset_name='face_2d_keypoints_dataset', + namespace='modelscope', + split='train', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=tmp_dir, + cfg_options=cfg_options) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_single_gpu(self): + temp_file_dir = tempfile.TemporaryDirectory() + tmp_dir = temp_file_dir.name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + + self._train(tmp_dir) + + results_files = os.listdir(tmp_dir) + json_files = glob.glob(os.path.join(tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + + temp_file_dir.cleanup() + + +if __name__ == '__main__': + unittest.main() From 535acaef5bc01f8d6059e6acf364eaebb1147f7b Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Thu, 20 Oct 2022 12:13:19 +0800 Subject: [PATCH 075/129] [to #42322933]add test case to check xtcocotools availbility Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10462622 * add test case to check xtcocotools availbility --- tests/utils/test_compatibility.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/utils/test_compatibility.py diff --git a/tests/utils/test_compatibility.py b/tests/utils/test_compatibility.py new file mode 100644 index 00000000..f5222261 --- /dev/null +++ b/tests/utils/test_compatibility.py @@ -0,0 +1,19 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + + +class CompatibilityTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def tearDown(self): + super().tearDown() + + def test_xtcocotools(self): + from xtcocotools.coco import COCO + + +if __name__ == '__main__': + unittest.main() From e7c7be6aae33537e25cf4187336a0c46479a75f3 Mon Sep 17 00:00:00 2001 From: "xingguang.zxg" Date: Thu, 20 Oct 2022 12:51:48 +0800 Subject: [PATCH 076/129] fix cpu error Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10432300 * fix cpu error --- modelscope/models/cv/text_driven_segmentation/lseg_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/models/cv/text_driven_segmentation/lseg_model.py b/modelscope/models/cv/text_driven_segmentation/lseg_model.py index 9a5754c6..ec381356 100644 --- a/modelscope/models/cv/text_driven_segmentation/lseg_model.py +++ b/modelscope/models/cv/text_driven_segmentation/lseg_model.py @@ -93,7 +93,7 @@ class TextDrivenSeg(TorchModel): """ with torch.no_grad(): if self.device_id == -1: - output = self.model(image) + output = self.model(image, [text]) else: device = torch.device('cuda', self.device_id) output = self.model(image.to(device), [text]) From 1483c64638a002279443752adc195df6ef2e7494 Mon Sep 17 00:00:00 2001 From: "shichen.fsc" Date: Thu, 20 Oct 2022 12:54:37 +0800 Subject: [PATCH 077/129] [to #42322933] Fix ASR error when resample failed, and add all asr models UT, add apply-cmvn for pytorch models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10465241 --- data/test/audios/asr_example_8K.wav | 3 + data/test/audios/asr_example_cn_dialect.wav | 3 + data/test/audios/asr_example_cn_en.wav | 3 + data/test/audios/asr_example_en.wav | 3 + data/test/audios/asr_example_es.wav | 3 + data/test/audios/asr_example_id.wav | 3 + data/test/audios/asr_example_ja.wav | 3 + data/test/audios/asr_example_ko.wav | 3 + data/test/audios/asr_example_ru.wav | 3 + .../pipelines/audio/asr_inference_pipeline.py | 19 +- .../pipelines/audio/kws_kwsbp_pipeline.py | 4 +- modelscope/preprocessors/asr.py | 6 + modelscope/utils/audio/audio_utils.py | 13 +- .../test_automatic_speech_recognition.py | 267 ++++++++++++++---- 14 files changed, 270 insertions(+), 66 deletions(-) create mode 100644 data/test/audios/asr_example_8K.wav create mode 100644 data/test/audios/asr_example_cn_dialect.wav create mode 100644 data/test/audios/asr_example_cn_en.wav create mode 100644 data/test/audios/asr_example_en.wav create mode 100644 data/test/audios/asr_example_es.wav create mode 100644 data/test/audios/asr_example_id.wav create mode 100644 data/test/audios/asr_example_ja.wav create mode 100644 data/test/audios/asr_example_ko.wav create mode 100644 data/test/audios/asr_example_ru.wav diff --git a/data/test/audios/asr_example_8K.wav b/data/test/audios/asr_example_8K.wav new file mode 100644 index 00000000..956aad27 --- /dev/null +++ b/data/test/audios/asr_example_8K.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e999c247bfebb03d556a31722f0ce7145cac20a67fac9da813ad336e1f549f9f +size 38954 diff --git a/data/test/audios/asr_example_cn_dialect.wav b/data/test/audios/asr_example_cn_dialect.wav new file mode 100644 index 00000000..e18fb05d --- /dev/null +++ b/data/test/audios/asr_example_cn_dialect.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32eb8d4d537941bf0edea69cd6723e8ba489fa3df64e13e29f96e4fae0b856f4 +size 93676 diff --git 
a/data/test/audios/asr_example_cn_en.wav b/data/test/audios/asr_example_cn_en.wav new file mode 100644 index 00000000..8baf3193 --- /dev/null +++ b/data/test/audios/asr_example_cn_en.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f57aee13ade70be6b2c6e4f5e5c7404bdb03057b63828baefbaadcf23855a4cb +size 472012 diff --git a/data/test/audios/asr_example_en.wav b/data/test/audios/asr_example_en.wav new file mode 100644 index 00000000..fa996eec --- /dev/null +++ b/data/test/audios/asr_example_en.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee8e0460ca707f108782be0d93c555bf34fb6b1cb297e5fceed70192cc65f9b +size 71244 diff --git a/data/test/audios/asr_example_es.wav b/data/test/audios/asr_example_es.wav new file mode 100644 index 00000000..95b22dc3 --- /dev/null +++ b/data/test/audios/asr_example_es.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450e31f9df8c5b48c617900625f01cb64c484f079a9843179fe9feaa7d163e61 +size 181964 diff --git a/data/test/audios/asr_example_id.wav b/data/test/audios/asr_example_id.wav new file mode 100644 index 00000000..54c30614 --- /dev/null +++ b/data/test/audios/asr_example_id.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:255494c41bc1dfb0c954d827ec6ce775900e4f7a55fb0a7881bdf9d66a03b425 +size 112078 diff --git a/data/test/audios/asr_example_ja.wav b/data/test/audios/asr_example_ja.wav new file mode 100644 index 00000000..e953fee2 --- /dev/null +++ b/data/test/audios/asr_example_ja.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22a55277908bbc3ef60a0cf56b230eb507b9e837574e8f493e93644b1d21c281 +size 200556 diff --git a/data/test/audios/asr_example_ko.wav b/data/test/audios/asr_example_ko.wav new file mode 100644 index 00000000..0dad1be3 --- /dev/null +++ b/data/test/audios/asr_example_ko.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee92191836c76412463d8b282a7ab4e1aa57386ba699ec011a3e2c4d64f32f4b +size 162636 diff --git a/data/test/audios/asr_example_ru.wav b/data/test/audios/asr_example_ru.wav new file mode 100644 index 00000000..b0cb8f2f --- /dev/null +++ b/data/test/audios/asr_example_ru.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77d1537fc584c1505d8aa10ec8c86af57ab661199e4f28fd7ffee3c22d1e4e61 +size 160204 diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index 4e8b658d..6a4864bf 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -47,22 +47,28 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): if isinstance(audio_in, str): # load pcm data from url if audio_in is url str - self.audio_in = load_bytes_from_url(audio_in) + self.audio_in, checking_audio_fs = load_bytes_from_url(audio_in) elif isinstance(audio_in, bytes): # load pcm data from wav data if audio_in is wave format - self.audio_in = extract_pcm_from_wav(audio_in) + self.audio_in, checking_audio_fs = extract_pcm_from_wav(audio_in) else: self.audio_in = audio_in + # set the sample_rate of audio_in if checking_audio_fs is valid + if checking_audio_fs is not None: + self.audio_fs = checking_audio_fs + if recog_type is None or audio_format is None: self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( audio_in=self.audio_in, recog_type=recog_type, audio_format=audio_format) - if hasattr(asr_utils, 'sample_rate_checking') and audio_fs is None: - 
self.audio_fs = asr_utils.sample_rate_checking( + if hasattr(asr_utils, 'sample_rate_checking'): + checking_audio_fs = asr_utils.sample_rate_checking( self.audio_in, self.audio_format) + if checking_audio_fs is not None: + self.audio_fs = checking_audio_fs if self.preprocessor is None: self.preprocessor = WavToScp() @@ -80,7 +86,7 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): logger.info(f"Decoding with {inputs['audio_format']} files ...") - data_cmd: Sequence[Tuple[str, str]] + data_cmd: Sequence[Tuple[str, str, str]] if inputs['audio_format'] == 'wav' or inputs['audio_format'] == 'pcm': data_cmd = ['speech', 'sound'] elif inputs['audio_format'] == 'kaldi_ark': @@ -88,6 +94,9 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): elif inputs['audio_format'] == 'tfrecord': data_cmd = ['speech', 'tfrecord'] + if inputs.__contains__('mvn_file'): + data_cmd.append(inputs['mvn_file']) + # generate asr inference command cmd = { 'model_type': inputs['model_type'], diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 5555c9e6..db6fc65d 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -51,10 +51,10 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): if isinstance(audio_in, str): # load pcm data from url if audio_in is url str - audio_in = load_bytes_from_url(audio_in) + audio_in, audio_fs = load_bytes_from_url(audio_in) elif isinstance(audio_in, bytes): # load pcm data from wav data if audio_in is wave format - audio_in = extract_pcm_from_wav(audio_in) + audio_in, audio_fs = extract_pcm_from_wav(audio_in) output = self.preprocessor.forward(self.model.forward(), audio_in) output = self.forward(output) diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index facaa132..91bf5860 100644 --- a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -133,6 +133,12 @@ class WavToScp(Preprocessor): else: inputs['asr_model_config'] = asr_model_config + if inputs['model_config'].__contains__('mvn_file'): + mvn_file = os.path.join(inputs['model_workspace'], + inputs['model_config']['mvn_file']) + assert os.path.exists(mvn_file), 'mvn_file does not exist' + inputs['mvn_file'] = mvn_file + elif inputs['model_type'] == Frameworks.tf: assert inputs['model_config'].__contains__( 'vocab_file'), 'vocab_file does not exist' diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 647d9521..32e2fa54 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -57,6 +57,7 @@ def update_conf(origin_config_file, new_config_file, conf_item: [str, str]): def extract_pcm_from_wav(wav: bytes) -> bytes: data = wav + sample_rate = None if len(data) > 44: frame_len = 44 file_len = len(data) @@ -70,29 +71,33 @@ def extract_pcm_from_wav(wav: bytes) -> bytes: 'Subchunk1ID'] == 'fmt ': header_fields['SubChunk1Size'] = struct.unpack( ' Union[bytes, str]: + sample_rate = None result = urlparse(url) if result.scheme is not None and len(result.scheme) > 0: storage = HTTPStorage() data = storage.read(url) - data = extract_pcm_from_wav(data) + data, sample_rate = extract_pcm_from_wav(data) else: data = url - return data + return data, sample_rate diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index 303fb6b9..c37a6a3f 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ 
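The audio utility changes above make extract_pcm_from_wav and load_bytes_from_url return a (data, sample_rate) pair, with the rate parsed from the WAV header when present, and the ASR pipeline only overrides its sample rate when that value is not None. A simplified sketch of the resulting caller pattern (the pipeline itself uses type checks rather than a URL prefix test):

from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav,
                                                load_bytes_from_url)


def load_audio(audio_in, default_fs=16000):
    # both helpers now return (pcm_or_input, sample_rate_or_None)
    if audio_in.startswith('http'):
        data, fs = load_bytes_from_url(audio_in)
    else:
        with open(audio_in, 'rb') as f:
            data, fs = extract_pcm_from_wav(f.read())
    # fall back to a caller-supplied rate when the header carried none
    return data, fs if fs is not None else default_fs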
b/tests/pipelines/test_automatic_speech_recognition.py @@ -45,6 +45,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, 'checking_item': OutputKeys.TEXT, 'example': 'wav_example' }, + 'test_run_with_url_pytorch': { + 'checking_item': OutputKeys.TEXT, + 'example': 'wav_example' + }, 'test_run_with_url_tf': { 'checking_item': OutputKeys.TEXT, 'example': 'wav_example' @@ -74,6 +78,170 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, } } + all_models_info = [ + { + 'model_group': 'damo', + 'model_id': + 'speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': 'speech_paraformer_asr_nat-aishell1-pytorch', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1', + 'wav_path': 'data/test/audios/asr_example_8K.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_8K.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_8K.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_cn_en.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_cn_en.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_cn_dialect.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_cn_dialect.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_8K.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_en.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_en.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline', + 'wav_path': 
'data/test/audios/asr_example_ru.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_ru.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_es.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_es.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_ko.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_ko.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_ja.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_ja.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_id.wav' + }, + { + 'model_group': 'damo', + 'model_id': + 'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_id.wav' + }, + ] + def setUp(self) -> None: self.am_pytorch_model_id = 'damo/speech_paraformer_asr_nat-aishell1-pytorch' self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1' @@ -90,7 +258,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, def run_pipeline(self, model_id: str, audio_in: Union[str, bytes], - sr: int = 16000) -> Dict[str, Any]: + sr: int = None) -> Dict[str, Any]: inference_16k_pipline = pipeline( task=Tasks.auto_speech_recognition, model=model_id) @@ -136,33 +304,26 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, return audio, fs @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_wav_pytorch(self): - """run with single waveform file + def test_run_with_pcm(self): + """run with wav data """ - logger.info('Run ASR test with waveform file (pytorch)...') + logger.info('Run ASR test with wav data (tensorflow)...') - wav_file_path = os.path.join(os.getcwd(), WAV_FILE) + audio, sr = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) rec_result = self.run_pipeline( - model_id=self.am_pytorch_model_id, audio_in=wav_file_path) - self.check_result('test_run_with_wav_pytorch', rec_result) - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_pcm_pytorch(self): - """run with wav data - """ + model_id=self.am_tf_model_id, audio_in=audio, sr=sr) + self.check_result('test_run_with_pcm_tf', rec_result) logger.info('Run ASR test with wav data (pytorch)...') - audio, sr = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) - rec_result = self.run_pipeline( model_id=self.am_pytorch_model_id, audio_in=audio, sr=sr) self.check_result('test_run_with_pcm_pytorch', rec_result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_wav_tf(self): + def test_run_with_wav(self): """run with single waveform file """ @@ -174,21 +335,14 @@ class 
AutomaticSpeechRecognitionTest(unittest.TestCase, model_id=self.am_tf_model_id, audio_in=wav_file_path) self.check_result('test_run_with_wav_tf', rec_result) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_pcm_tf(self): - """run with wav data - """ - - logger.info('Run ASR test with wav data (tensorflow)...') - - audio, sr = self.wav2bytes(os.path.join(os.getcwd(), WAV_FILE)) + logger.info('Run ASR test with waveform file (pytorch)...') rec_result = self.run_pipeline( - model_id=self.am_tf_model_id, audio_in=audio, sr=sr) - self.check_result('test_run_with_pcm_tf', rec_result) + model_id=self.am_pytorch_model_id, audio_in=wav_file_path) + self.check_result('test_run_with_wav_pytorch', rec_result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_url_tf(self): + def test_run_with_url(self): """run with single url file """ @@ -198,6 +352,12 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, model_id=self.am_tf_model_id, audio_in=URL_FILE) self.check_result('test_run_with_url_tf', rec_result) + logger.info('Run ASR test with url file (pytorch)...') + + rec_result = self.run_pipeline( + model_id=self.am_pytorch_model_id, audio_in=URL_FILE) + self.check_result('test_run_with_url_pytorch', rec_result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_wav_dataset_pytorch(self): """run with datasets, and audio format is waveform @@ -217,7 +377,6 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, data.text # hypothesis text """ - logger.info('Run ASR test with waveform dataset (pytorch)...') logger.info('Downloading waveform testsets file ...') dataset_path = download_and_untar( @@ -225,40 +384,38 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, LITTLE_TESTSETS_URL, self.workspace) dataset_path = os.path.join(dataset_path, 'wav', 'test') + logger.info('Run ASR test with waveform dataset (tensorflow)...') + + rec_result = self.run_pipeline( + model_id=self.am_tf_model_id, audio_in=dataset_path) + self.check_result('test_run_with_wav_dataset_tf', rec_result) + + logger.info('Run ASR test with waveform dataset (pytorch)...') + rec_result = self.run_pipeline( model_id=self.am_pytorch_model_id, audio_in=dataset_path) self.check_result('test_run_with_wav_dataset_pytorch', rec_result) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_run_with_wav_dataset_tf(self): - """run with datasets, and audio format is waveform - datasets directory: - - wav - test # testsets - xx.wav - ... - dev # devsets - yy.wav - ... - train # trainsets - zz.wav - ... 
- transcript - data.text # hypothesis text + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_all_models(self): + """run with all models """ - logger.info('Run ASR test with waveform dataset (tensorflow)...') - logger.info('Downloading waveform testsets file ...') - - dataset_path = download_and_untar( - os.path.join(self.workspace, LITTLE_TESTSETS_FILE), - LITTLE_TESTSETS_URL, self.workspace) - dataset_path = os.path.join(dataset_path, 'wav', 'test') - - rec_result = self.run_pipeline( - model_id=self.am_tf_model_id, audio_in=dataset_path) - self.check_result('test_run_with_wav_dataset_tf', rec_result) + logger.info('Run ASR test with all models') + + for item in self.all_models_info: + model_id = item['model_group'] + '/' + item['model_id'] + wav_path = item['wav_path'] + rec_result = self.run_pipeline( + model_id=model_id, audio_in=wav_path) + if rec_result.__contains__(OutputKeys.TEXT): + logger.info(ColorCodes.MAGENTA + str(item['model_id']) + ' ' + + ColorCodes.YELLOW + + str(rec_result[OutputKeys.TEXT]) + + ColorCodes.END) + else: + logger.info(ColorCodes.MAGENTA + str(rec_result) + + ColorCodes.END) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From acba1786b0ac263140e2743b51fc7fc7c36d0980 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 20 Oct 2022 15:29:34 +0800 Subject: [PATCH 078/129] [to #42322933] Fix bug in UT daily 1. Fix bugs in daily test 2. Fix a bug that the updating of lr is before the first time of updating of optimizer TODO this will still cause warnings when GA is above 1 3. Remove the judgement of mode in text-classification's preprocessor to fit the base trainer(Bug) Update some regression bins to fit the preprocessor 4. Update the regression tool to let outer code modify atol and rtol 5. Add the default metric for text-classification task 6. 
Remove the useless ckpt conversion method in bert to avoid the requirement of tf when loading modeling_bert Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10430764 --- data/test/regression/sbert-base-tnews.bin | 3 + data/test/regression/sbert_nli.bin | 4 +- data/test/regression/sbert_sen_sim.bin | 4 +- ...rt_for_sequence_classification_exporter.py | 4 +- modelscope/metrics/builder.py | 1 + modelscope/models/nlp/bert/modeling_bert.py | 78 --------------- modelscope/preprocessors/nlp/nlp_base.py | 7 +- .../trainers/hooks/lr_scheduler_hook.py | 2 +- modelscope/trainers/trainer.py | 2 +- modelscope/utils/regress_test_utils.py | 94 ++++++++++++------- tests/msdatasets/test_ms_dataset.py | 3 +- .../test_finetune_sequence_classification.py | 33 ++++++- tests/trainers/test_trainer_with_nlp.py | 24 +++-- 13 files changed, 124 insertions(+), 135 deletions(-) create mode 100644 data/test/regression/sbert-base-tnews.bin diff --git a/data/test/regression/sbert-base-tnews.bin b/data/test/regression/sbert-base-tnews.bin new file mode 100644 index 00000000..1546860f --- /dev/null +++ b/data/test/regression/sbert-base-tnews.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bce1341f4b55d536771dad6e2b280458579f46c3216474ceb8a926022ab53d0 +size 151572 diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin index a5f680bb..68efb778 100644 --- a/data/test/regression/sbert_nli.bin +++ b/data/test/regression/sbert_nli.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44e3925c15d86d8596baeb6bd1d153d86f57b7489798b2cf988a1248e110fd62 -size 62231 +oid sha256:6af5024a26337a440c7ea2935fce84af558dd982ee97a2f027bb922cc874292b +size 61741 diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin index a59cbe0b..362f762c 100644 --- a/data/test/regression/sbert_sen_sim.bin +++ b/data/test/regression/sbert_sen_sim.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ff17a0272752de4c88d4254b2e881f97f8ef022f03609d03ee1de0ae964368a -size 62235 +oid sha256:bbce084781342ca7274c2e4d02ed5c5de43ba213a3b76328d5994404d6544c41 +size 61745 diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index dc1e2b92..52dab4bc 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -23,12 +23,14 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): def generate_dummy_inputs(self, shape: Tuple = None, + pair: bool = False, **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. @param shape: A tuple of input shape which should have at most two dimensions. shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. + @param pair: Generate sentence pairs or single sentences for dummy inputs. @return: Dummy inputs. 
""" @@ -55,7 +57,7 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): **sequence_length }) preprocessor: Preprocessor = build_preprocessor(cfg, field_name) - if preprocessor.pair: + if pair: first_sequence = preprocessor.tokenizer.unk_token second_sequence = preprocessor.tokenizer.unk_token else: diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index ee4d2840..1c8e16d7 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -32,6 +32,7 @@ task_default_metrics = { Tasks.sentiment_classification: [Metrics.seq_cls_metric], Tasks.token_classification: [Metrics.token_cls_metric], Tasks.text_generation: [Metrics.text_gen_metric], + Tasks.text_classification: [Metrics.seq_cls_metric], Tasks.image_denoising: [Metrics.image_denoise_metric], Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], Tasks.image_portrait_enhancement: diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py index e91a6433..7c1dfcf5 100755 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -15,7 +15,6 @@ """PyTorch BERT model. """ import math -import os import warnings from dataclasses import dataclass from typing import Optional, Tuple @@ -41,7 +40,6 @@ from transformers.modeling_utils import (PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer) -from modelscope.models.base import TorchModel from modelscope.utils.logger import get_logger from .configuration_bert import BertConfig @@ -50,81 +48,6 @@ logger = get_logger(__name__) _CONFIG_FOR_DOC = 'BertConfig' -def load_tf_weights_in_bert(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see ' - 'https://www.tensorflow.org/install/ for installation instructions.' 
- ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f'Converting TensorFlow checkpoint from {tf_path}') - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info(f'Loading TF weight {name} with shape {shape}') - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split('/') - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any(n in [ - 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', - 'AdamWeightDecayOptimizer_1', 'global_step' - ] for n in name): - logger.info(f"Skipping {'/'.join(name)}") - continue - pointer = model - for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - scope_names = re.split(r'_(\d+)', m_name) - else: - scope_names = [m_name] - if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif scope_names[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif scope_names[0] == 'squad': - pointer = getattr(pointer, 'classifier') - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info(f"Skipping {'/'.join(name)}") - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': - array = np.transpose(array) - try: - if pointer.shape != array.shape: - raise ValueError( - f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' - ) - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f'Initialize PyTorch weight {name}') - pointer.data = torch.from_numpy(array) - return model - - class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -750,7 +673,6 @@ class BertPreTrainedModel(PreTrainedModel): """ config_class = BertConfig - load_tf_weights = load_tf_weights_in_bert base_model_prefix = 'bert' supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r'position_ids'] diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 267dbb8c..bc96f569 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -2,7 +2,7 @@ import os.path as osp import re -from typing import Any, Dict, Iterable, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import sentencepiece as spm @@ -217,7 +217,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): return isinstance(label, str) or isinstance(label, int) if labels is not None: - if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ + if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: output[OutputKeys.LABELS] = [ self.label2id[str(label)] for label in labels @@ -314,8 +314,7 @@ class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 
'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['padding'] = kwargs.get('padding', 'max_length') kwargs['max_length'] = kwargs.pop('sequence_length', 128) super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py index ca0ec01b..32fb0250 100644 --- a/modelscope/trainers/hooks/lr_scheduler_hook.py +++ b/modelscope/trainers/hooks/lr_scheduler_hook.py @@ -47,7 +47,7 @@ class LrSchedulerHook(Hook): return lr def before_train_iter(self, trainer): - if not self.by_epoch: + if not self.by_epoch and trainer.iter > 0: if self.warmup_lr_scheduler is not None: self.warmup_lr_scheduler.step() else: diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 9eaff762..61d11aa6 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -651,7 +651,7 @@ class EpochBasedTrainer(BaseTrainer): # TODO: support MsDataset load for cv if hasattr(data_cfg, 'name'): dataset = MsDataset.load( - dataset_name=data_cfg.name, + dataset_name=data_cfg.pop('name'), **data_cfg, ) cfg = ConfigDict(type=self.cfg.model.type, mode=mode) diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 47bbadfe..3c1e5c1c 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -65,7 +65,8 @@ class RegressTool: def monitor_module_single_forward(self, module: nn.Module, file_name: str, - compare_fn=None): + compare_fn=None, + **kwargs): """Monitor a pytorch module in a single forward. @param module: A torch module @@ -107,7 +108,7 @@ class RegressTool: baseline = os.path.join(tempfile.gettempdir(), name) self.load(baseline, name) with open(baseline, 'rb') as f: - baseline_json = pickle.load(f) + base = pickle.load(f) class NumpyEncoder(json.JSONEncoder): """Special json encoder for numpy types @@ -122,9 +123,9 @@ class RegressTool: return obj.tolist() return json.JSONEncoder.default(self, obj) - print(f'baseline: {json.dumps(baseline_json, cls=NumpyEncoder)}') + print(f'baseline: {json.dumps(base, cls=NumpyEncoder)}') print(f'latest : {json.dumps(io_json, cls=NumpyEncoder)}') - if not compare_io_and_print(baseline_json, io_json, compare_fn): + if not compare_io_and_print(base, io_json, compare_fn, **kwargs): raise ValueError('Result not match!') @contextlib.contextmanager @@ -136,7 +137,8 @@ class RegressTool: ignore_keys=None, compare_random=True, reset_dropout=True, - lazy_stop_callback=None): + lazy_stop_callback=None, + **kwargs): """Monitor a pytorch module's backward data and cfg data within a step of the optimizer. 
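For context on the lr_scheduler_hook.py change above (item 2 of this commit message): skipping the per-iteration scheduler step when trainer.iter == 0 ensures the optimizer has stepped at least once before the scheduler does. A minimal sketch of the ordering PyTorch expects, illustrative only and not code from this patch:

    import torch

    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda it: 0.95 ** it)

    for it in range(3):
        loss = model(torch.randn(8, 4)).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   # the optimizer must step first ...
        scheduler.step()   # ... otherwise PyTorch warns that the LR schedule is off by one

As the TODO in the commit message notes, with gradient accumulation above 1 the optimizer still steps less often than a per-iteration scheduler, so the warning can persist.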
This is usually useful when you try to change some dangerous code @@ -265,14 +267,15 @@ class RegressTool: baseline_json = pickle.load(f) if level == 'strict' and not compare_io_and_print( - baseline_json['forward'], io_json, compare_fn): + baseline_json['forward'], io_json, compare_fn, **kwargs): raise RuntimeError('Forward not match!') if not compare_backward_and_print( baseline_json['backward'], bw_json, compare_fn=compare_fn, ignore_keys=ignore_keys, - level=level): + level=level, + **kwargs): raise RuntimeError('Backward not match!') cfg_opt1 = { 'optimizer': baseline_json['optimizer'], @@ -286,7 +289,8 @@ class RegressTool: 'cfg': summary['cfg'], 'state': None if not compare_random else summary['state'] } - if not compare_cfg_and_optimizers(cfg_opt1, cfg_opt2, compare_fn): + if not compare_cfg_and_optimizers(cfg_opt1, cfg_opt2, compare_fn, + **kwargs): raise RuntimeError('Cfg or optimizers not match!') @@ -303,7 +307,8 @@ class MsRegressTool(RegressTool): compare_fn=None, ignore_keys=None, compare_random=True, - lazy_stop_callback=None): + lazy_stop_callback=None, + **kwargs): if lazy_stop_callback is None: @@ -319,7 +324,7 @@ class MsRegressTool(RegressTool): trainer.register_hook(EarlyStopHook()) - def _train_loop(trainer, *args, **kwargs): + def _train_loop(trainer, *args_train, **kwargs_train): with self.monitor_module_train( trainer, file_name, @@ -327,9 +332,11 @@ class MsRegressTool(RegressTool): compare_fn=compare_fn, ignore_keys=ignore_keys, compare_random=compare_random, - lazy_stop_callback=lazy_stop_callback): + lazy_stop_callback=lazy_stop_callback, + **kwargs): try: - return trainer.train_loop_origin(*args, **kwargs) + return trainer.train_loop_origin(*args_train, + **kwargs_train) except MsRegressTool.EarlyStopError: pass @@ -530,7 +537,8 @@ def compare_arguments_nested(print_content, ) return False if not all([ - compare_arguments_nested(None, sub_arg1, sub_arg2) + compare_arguments_nested( + None, sub_arg1, sub_arg2, rtol=rtol, atol=atol) for sub_arg1, sub_arg2 in zip(arg1, arg2) ]): if print_content is not None: @@ -551,7 +559,8 @@ def compare_arguments_nested(print_content, print(f'{print_content}, key diff:{set(keys1) - set(keys2)}') return False if not all([ - compare_arguments_nested(None, arg1[key], arg2[key]) + compare_arguments_nested( + None, arg1[key], arg2[key], rtol=rtol, atol=atol) for key in keys1 ]): if print_content is not None: @@ -574,7 +583,7 @@ def compare_arguments_nested(print_content, raise ValueError(f'type not supported: {type1}') -def compare_io_and_print(baseline_json, io_json, compare_fn=None): +def compare_io_and_print(baseline_json, io_json, compare_fn=None, **kwargs): if compare_fn is None: def compare_fn(*args, **kwargs): @@ -602,10 +611,10 @@ def compare_io_and_print(baseline_json, io_json, compare_fn=None): else: match = compare_arguments_nested( f'unmatched module {key} input args', v1input['args'], - v2input['args']) and match + v2input['args'], **kwargs) and match match = compare_arguments_nested( f'unmatched module {key} input kwargs', v1input['kwargs'], - v2input['kwargs']) and match + v2input['kwargs'], **kwargs) and match v1output = numpify_tensor_nested(v1['output']) v2output = numpify_tensor_nested(v2['output']) res = compare_fn(v1output, v2output, key, 'output') @@ -615,8 +624,11 @@ def compare_io_and_print(baseline_json, io_json, compare_fn=None): ) match = match and res else: - match = compare_arguments_nested(f'unmatched module {key} outputs', - v1output, v2output) and match + match = compare_arguments_nested( + 
f'unmatched module {key} outputs', + arg1=v1output, + arg2=v2output, + **kwargs) and match return match @@ -624,7 +636,8 @@ def compare_backward_and_print(baseline_json, bw_json, level, ignore_keys=None, - compare_fn=None): + compare_fn=None, + **kwargs): if compare_fn is None: def compare_fn(*args, **kwargs): @@ -653,18 +666,26 @@ def compare_backward_and_print(baseline_json, data2, grad2, data_after2 = bw_json[key]['data'], bw_json[key][ 'grad'], bw_json[key]['data_after'] match = compare_arguments_nested( - f'unmatched module {key} tensor data', data1, data2) and match + f'unmatched module {key} tensor data', + arg1=data1, + arg2=data2, + **kwargs) and match if level == 'strict': match = compare_arguments_nested( - f'unmatched module {key} grad data', grad1, - grad2) and match + f'unmatched module {key} grad data', + arg1=grad1, + arg2=grad2, + **kwargs) and match match = compare_arguments_nested( f'unmatched module {key} data after step', data_after1, - data_after2) and match + data_after2, **kwargs) and match return match -def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None): +def compare_cfg_and_optimizers(baseline_json, + cfg_json, + compare_fn=None, + **kwargs): if compare_fn is None: def compare_fn(*args, **kwargs): @@ -686,12 +707,12 @@ def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None): print( f"Optimizer type not equal:{optimizer1['type']} and {optimizer2['type']}" ) - match = compare_arguments_nested('unmatched optimizer defaults', - optimizer1['defaults'], - optimizer2['defaults']) and match - match = compare_arguments_nested('unmatched optimizer state_dict', - optimizer1['state_dict'], - optimizer2['state_dict']) and match + match = compare_arguments_nested( + 'unmatched optimizer defaults', optimizer1['defaults'], + optimizer2['defaults'], **kwargs) and match + match = compare_arguments_nested( + 'unmatched optimizer state_dict', optimizer1['state_dict'], + optimizer2['state_dict'], **kwargs) and match res = compare_fn(lr_scheduler1, lr_scheduler2, None, 'lr_scheduler') if res is not None: @@ -703,16 +724,17 @@ def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None): print( f"Optimizer type not equal:{lr_scheduler1['type']} and {lr_scheduler2['type']}" ) - match = compare_arguments_nested('unmatched lr_scheduler state_dict', - lr_scheduler1['state_dict'], - lr_scheduler2['state_dict']) and match + match = compare_arguments_nested( + 'unmatched lr_scheduler state_dict', lr_scheduler1['state_dict'], + lr_scheduler2['state_dict'], **kwargs) and match res = compare_fn(cfg1, cfg2, None, 'cfg') if res is not None: print(f'cfg compared with user compare_fn with result:{res}\n') match = match and res else: - match = compare_arguments_nested('unmatched cfg', cfg1, cfg2) and match + match = compare_arguments_nested( + 'unmatched cfg', arg1=cfg1, arg2=cfg2, **kwargs) and match res = compare_fn(state1, state2, None, 'state') if res is not None: @@ -721,6 +743,6 @@ def compare_cfg_and_optimizers(baseline_json, cfg_json, compare_fn=None): match = match and res else: match = compare_arguments_nested('unmatched random state', state1, - state2) and match + state2, **kwargs) and match return match diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 91a3b5c5..1e537e93 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -52,7 +52,8 @@ class MsDatasetTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def 
test_ms_csv_basic(self): ms_ds_train = MsDataset.load( - 'afqmc_small', namespace='userxiaoming', split='train') + 'clue', subset_name='afqmc', + split='train').to_hf_dataset().select(range(5)) print(next(iter(ms_ds_train))) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index f2adfa22..27db1f18 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -16,7 +16,8 @@ from modelscope.trainers.optimizer.child_tuning_adamw_optimizer import \ calculate_fisher from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.data_utils import to_device -from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.regress_test_utils import (MsRegressTool, + compare_arguments_nested) from modelscope.utils.test_utils import test_level @@ -41,6 +42,33 @@ class TestFinetuneSequenceClassification(unittest.TestCase): def test_trainer_repeatable(self): import torch # noqa + def compare_fn(value1, value2, key, type): + # Ignore the differences between optimizers of two torch versions + if type != 'optimizer': + return None + + match = (value1['type'] == value2['type']) + shared_defaults = set(value1['defaults'].keys()).intersection( + set(value2['defaults'].keys())) + match = all([ + compare_arguments_nested(f'Optimizer defaults {key} not match', + value1['defaults'][key], + value2['defaults'][key]) + for key in shared_defaults + ]) and match + match = (len(value1['state_dict']['param_groups']) == len( + value2['state_dict']['param_groups'])) and match + for group1, group2 in zip(value1['state_dict']['param_groups'], + value2['state_dict']['param_groups']): + shared_keys = set(group1.keys()).intersection( + set(group2.keys())) + match = all([ + compare_arguments_nested( + f'Optimizer param_groups {key} not match', group1[key], + group2[key]) for key in shared_keys + ]) and match + return match + def cfg_modify_fn(cfg): cfg.task = 'nli' cfg['preprocessor'] = {'type': 'nli-tokenizer'} @@ -98,7 +126,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase): name=Trainers.nlp_base_trainer, default_args=kwargs) with self.regress_tool.monitor_ms_train( - trainer, 'sbert-base-tnews', level='strict'): + trainer, 'sbert-base-tnews', level='strict', + compare_fn=compare_fn): trainer.train() def finetune(self, diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 6030ada9..8357e778 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -29,7 +29,8 @@ class TestTrainerWithNlp(unittest.TestCase): os.makedirs(self.tmp_dir) self.dataset = MsDataset.load( - 'afqmc_small', namespace='userxiaoming', split='train') + 'clue', subset_name='afqmc', + split='train').to_hf_dataset().select(range(2)) def tearDown(self): shutil.rmtree(self.tmp_dir) @@ -73,7 +74,7 @@ class TestTrainerWithNlp(unittest.TestCase): output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) pipeline_sentence_similarity(output_dir) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 3, 'skip test in current test level') def test_trainer_with_backbone_head(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' kwargs = dict( @@ -99,6 +100,8 @@ class TestTrainerWithNlp(unittest.TestCase): model_id = 
'damo/nlp_structbert_sentiment-classification_chinese-base' cfg = read_config(model_id, revision='beta') cfg.train.max_epochs = 20 + cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1} + cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1} cfg.train.work_dir = self.tmp_dir cfg_file = os.path.join(self.tmp_dir, 'config.json') cfg.dump(cfg_file) @@ -120,22 +123,24 @@ class TestTrainerWithNlp(unittest.TestCase): checkpoint_path=os.path.join(self.tmp_dir, 'epoch_10.pth')) self.assertTrue(Metrics.accuracy in eval_results) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_configured_datasets(self): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' cfg: Config = read_config(model_id) cfg.train.max_epochs = 20 + cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1} + cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1} cfg.train.work_dir = self.tmp_dir cfg.dataset = { 'train': { - 'name': 'afqmc_small', + 'name': 'clue', + 'subset_name': 'afqmc', 'split': 'train', - 'namespace': 'userxiaoming' }, 'val': { - 'name': 'afqmc_small', + 'name': 'clue', + 'subset_name': 'afqmc', 'split': 'train', - 'namespace': 'userxiaoming' }, } cfg_file = os.path.join(self.tmp_dir, 'config.json') @@ -159,6 +164,11 @@ class TestTrainerWithNlp(unittest.TestCase): model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' cfg: Config = read_config(model_id) cfg.train.max_epochs = 3 + cfg.preprocessor.first_sequence = 'sentence1' + cfg.preprocessor.second_sequence = 'sentence2' + cfg.preprocessor.label = 'label' + cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1} + cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1} cfg.train.work_dir = self.tmp_dir cfg_file = os.path.join(self.tmp_dir, 'config.json') cfg.dump(cfg_file) From 8ec90ccbf8926235ee3176d4fd6b7f10dbdf09ad Mon Sep 17 00:00:00 2001 From: "xiangpeng.wxp" Date: Thu, 20 Oct 2022 17:35:27 +0800 Subject: [PATCH 079/129] [to #42322933] Add uttest for en2fr and fr2en tasks Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10467797 * add uttest for en2fr and fr2en tasks --- tests/pipelines/test_csanmt_translation.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/pipelines/test_csanmt_translation.py b/tests/pipelines/test_csanmt_translation.py index f7ec81cd..83827813 100644 --- a/tests/pipelines/test_csanmt_translation.py +++ b/tests/pipelines/test_csanmt_translation.py @@ -26,6 +26,20 @@ class TranslationTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline(self.task, model=model_id) print(pipeline_ins(input=inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_en2fr(self): + model_id = 'damo/nlp_csanmt_translation_en2fr' + inputs = 'When I was in my 20s, I saw my very first psychotherapy client.' + pipeline_ins = pipeline(self.task, model=model_id) + print(pipeline_ins(input=inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_fr2en(self): + model_id = 'damo/nlp_csanmt_translation_fr2en' + inputs = "Quand j'avais la vingtaine, j'ai vu mes tout premiers clients comme psychothérapeute." 
+ pipeline_ins = pipeline(self.task, model=model_id) + print(pipeline_ins(input=inputs)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): inputs = '声明补充说,沃伦的同事都深感震惊,并且希望他能够投案自首。' From b2c5876eadc3af6fbb5049fbe907dd99cca08d2a Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Thu, 20 Oct 2022 19:31:53 +0800 Subject: [PATCH 080/129] [to #42322933]fix create dir when dist Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10459756 --- modelscope/trainers/hooks/logger/text_logger_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index 6629a0c9..8552ab4e 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -51,7 +51,7 @@ class TextLoggerHook(LoggerHook): if self.out_dir is None: self.out_dir = trainer.work_dir - if not osp.exists(self.out_dir): + if not osp.exists(self.out_dir) and is_master(): os.makedirs(self.out_dir) trainer.logger.info('Text logs will be saved to {}'.format( From de6d84cb9781181dc49b8162fb1f5a23fe4c8993 Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Thu, 20 Oct 2022 19:33:06 +0800 Subject: [PATCH 081/129] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E5=A4=8Dpipel?= =?UTF-8?q?ine=E4=B8=B2=E8=81=94=E6=97=B6collate=5Ffn=E5=BC=82=E5=B8=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复pipeline串联时collate_fn异常 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10457058 --- modelscope/pipelines/base.py | 2 ++ .../cv/body_3d_keypoints_pipeline.py | 21 ++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index ea329be4..644749fc 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -433,6 +433,8 @@ def collate_fn(data, device): if isinstance(data, dict) or isinstance(data, Mapping): return type(data)({k: collate_fn(v, device) for k, v in data.items()}) elif isinstance(data, (tuple, list)): + if 0 == len(data): + return torch.Tensor([]) if isinstance(data[0], (int, float)): return default_collate(data).to(device) else: diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index 3502915c..8522ceff 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -143,6 +143,13 @@ class Body3DKeypointsPipeline(Pipeline): max_frame = self.keypoint_model_3d.cfg.model.INPUT.MAX_FRAME # max video frame number to be predicted 3D joints for i, frame in enumerate(video_frames): kps_2d = self.human_body_2d_kps_detector(frame) + if [] == kps_2d.get('boxes'): + res = { + 'success': False, + 'msg': f'fail to detect person at image frame {i}' + } + return res + box = kps_2d['boxes'][ 0] # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox pose = kps_2d['keypoints'][0] # keypoints: [15, 2] @@ -180,7 +187,15 @@ class Body3DKeypointsPipeline(Pipeline): return res def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: - res = {OutputKeys.KEYPOINTS: [], OutputKeys.TIMESTAMPS: []} + output_video_path = kwargs.get('output_video', None) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + + res = { + 
OutputKeys.KEYPOINTS: [], + OutputKeys.TIMESTAMPS: [], + OutputKeys.OUTPUT_VIDEO: output_video_path + } if not input['success']: pass @@ -189,10 +204,6 @@ class Body3DKeypointsPipeline(Pipeline): pred_3d_pose = poses.data.cpu().numpy()[ 0] # [frame_num, joint_num, joint_dim] - output_video_path = kwargs.get('output_video', None) - if output_video_path is None: - output_video_path = tempfile.NamedTemporaryFile( - suffix='.mp4').name if 'render' in self.keypoint_model_3d.cfg.keys(): self.render_prediction(pred_3d_pose, output_video_path) res[OutputKeys.OUTPUT_VIDEO] = output_video_path From 2b49b322a2b452b96413fe70c678c78be7b5b61a Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Thu, 20 Oct 2022 19:50:40 +0800 Subject: [PATCH 082/129] [to #42322933] Add palm ut MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为以下三个模型补充 ut damo/nlp_palm2.0_text-generation_chinese-large damo/nlp_palm2.0_text-generation_commodity_chinese-base damo/nlp_palm2.0_text-generation_weather_chinese-base Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10435599 --- tests/pipelines/test_text_generation.py | 50 +++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 5a270f83..4b0ebd47 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -15,12 +15,17 @@ from modelscope.utils.test_utils import test_level class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.palm_model_id_zh = 'damo/nlp_palm2.0_text-generation_chinese-base' + self.palm_model_id_zh_base = 'damo/nlp_palm2.0_text-generation_chinese-base' + self.palm_model_id_zh_large = 'damo/nlp_palm2.0_text-generation_chinese-large' + self.palm_model_id_zh_commodity = 'damo/nlp_palm2.0_text-generation_commodity_chinese-base' + self.palm_model_id_zh_weather = 'damo/nlp_palm2.0_text-generation_weather_chinese-base' self.palm_model_id_en = 'damo/nlp_palm2.0_text-generation_english-base' self.palm_input_zh = """ 本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方: 1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代 """ + self.palm_input_commodity = '垃圾桶,双层,可拆卸,加高,加高双层,把手,垃圾桶,内附,万向轮' + self.palm_input_weather = "今日天气类型='浮尘'&空气质量等级='重度污染'&紫外线强度指数='中等'" self.palm_input_en = """ The Director of Public Prosecutions who let off Lord Janner over alleged child sex abuse started her career at a legal chambers when the disgraced Labour peer was a top QC there . 
Alison Saunders , @@ -51,8 +56,8 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): print(pipeline_ins(input)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_palm_zh_with_model_name(self): - self.run_pipeline_with_model_id(self.palm_model_id_zh, + def test_palm_zh_base_with_model_name(self): + self.run_pipeline_with_model_id(self.palm_model_id_zh_base, self.palm_input_zh) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -71,10 +76,40 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): self.gpt3_input) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_palm_zh_with_model_instance(self): - self.run_pipeline_with_model_instance(self.palm_model_id_zh, + def test_palm_zh_large_with_model_name(self): + self.run_pipeline_with_model_id(self.palm_model_id_zh_large, + self.palm_input_zh) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_commodity_with_model_name(self): + self.run_pipeline_with_model_id(self.palm_model_id_zh_commodity, + self.palm_input_commodity) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_weather_with_model_name(self): + self.run_pipeline_with_model_id(self.palm_model_id_zh_weather, + self.palm_input_weather) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_base_with_model_instance(self): + self.run_pipeline_with_model_instance(self.palm_model_id_zh_base, self.palm_input_zh) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_large_with_model_instance(self): + self.run_pipeline_with_model_instance(self.palm_model_id_zh_large, + self.palm_input_zh) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_commodity_with_model_instance(self): + self.run_pipeline_with_model_instance(self.palm_model_id_zh_commodity, + self.palm_input_commodity) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_palm_zh_weather_with_model_instance(self): + self.run_pipeline_with_model_instance(self.palm_model_id_zh_weather, + self.palm_input_weather) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_palm_en_with_model_instance(self): self.run_pipeline_with_model_instance(self.palm_model_id_en, @@ -92,8 +127,9 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_palm(self): - for model_id, input in ((self.palm_model_id_zh, self.palm_input_zh), - (self.palm_model_id_en, self.palm_input_en)): + for model_id, input in ((self.palm_model_id_zh_base, + self.palm_input_zh), (self.palm_model_id_en, + self.palm_input_en)): cache_path = snapshot_download(model_id) model = PalmForTextGeneration.from_pretrained(cache_path) preprocessor = TextGenerationPreprocessor( From 0bc9660fccdaaf45947d0f97c59bbeb10283f176 Mon Sep 17 00:00:00 2001 From: "lee.lcy" Date: Thu, 20 Oct 2022 21:12:14 +0800 Subject: [PATCH 083/129] [to #42322933] modify img_embedding type for image_reid_person modify img_embedding type for image_reid_person Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10477972 --- modelscope/pipelines/cv/image_reid_person_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelscope/pipelines/cv/image_reid_person_pipeline.py 
b/modelscope/pipelines/cv/image_reid_person_pipeline.py index 64674a65..9f60142a 100644 --- a/modelscope/pipelines/cv/image_reid_person_pipeline.py +++ b/modelscope/pipelines/cv/image_reid_person_pipeline.py @@ -53,6 +53,7 @@ class ImageReidPersonPipeline(Pipeline): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: img = input['img'] img_embedding = self.model(img) + img_embedding = img_embedding.detach().cpu().numpy() return {OutputKeys.IMG_EMBEDDING: img_embedding} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: From 9842a427ef793a8ff9e4e67a91bd74ec140b19f5 Mon Sep 17 00:00:00 2001 From: "chaojie.mcj" Date: Thu, 20 Oct 2022 21:12:47 +0800 Subject: [PATCH 084/129] modify headers for __init__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更改部分__init__文件的header license Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10393596 --- .../cv/image_to_image_generation/__init__.py | 2 +- .../data/__init__.py | 2 +- .../data/transforms.py | 1 + .../models/__init__.py | 2 +- .../image_to_image_generation/ops/__init__.py | 2 +- .../cv/image_to_image_translation/__init__.py | 24 +++++++++++++++++++ .../data/__init__.py | 1 + .../models/__init__.py | 1 + .../ops/__init__.py | 1 + .../product_retrieval_embedding/__init__.py | 2 +- .../item_detection.py | 1 + .../item_embedding.py | 1 + .../product_retrieval_embedding/item_model.py | 2 ++ .../models/multi_modal/diffusion/__init__.py | 1 + .../models/multi_modal/gemm/__init__.py | 1 + .../multi_stage_diffusion/__init__.py | 1 + .../models/multi_modal/team/__init__.py | 1 + 17 files changed, 41 insertions(+), 5 deletions(-) diff --git a/modelscope/models/cv/image_to_image_generation/__init__.py b/modelscope/models/cv/image_to_image_generation/__init__.py index fb408086..1af3e55f 100644 --- a/modelscope/models/cv/image_to_image_generation/__init__.py +++ b/modelscope/models/cv/image_to_image_generation/__init__.py @@ -1,2 +1,2 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from . import data, models, ops diff --git a/modelscope/models/cv/image_to_image_generation/data/__init__.py b/modelscope/models/cv/image_to_image_generation/data/__init__.py index 33c8cf44..22b9d22c 100644 --- a/modelscope/models/cv/image_to_image_generation/data/__init__.py +++ b/modelscope/models/cv/image_to_image_generation/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/image_to_image_generation/data/transforms.py b/modelscope/models/cv/image_to_image_generation/data/transforms.py index 5376d813..29a25b4b 100644 --- a/modelscope/models/cv/image_to_image_generation/data/transforms.py +++ b/modelscope/models/cv/image_to_image_generation/data/transforms.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import math import random diff --git a/modelscope/models/cv/image_to_image_generation/models/__init__.py b/modelscope/models/cv/image_to_image_generation/models/__init__.py index ec6a46fd..e98421f2 100644 --- a/modelscope/models/cv/image_to_image_generation/models/__init__.py +++ b/modelscope/models/cv/image_to_image_generation/models/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. 
and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/image_to_image_generation/ops/__init__.py b/modelscope/models/cv/image_to_image_generation/ops/__init__.py index 49674b49..e3dac584 100644 --- a/modelscope/models/cv/image_to_image_generation/ops/__init__.py +++ b/modelscope/models/cv/image_to_image_generation/ops/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/image_to_image_translation/__init__.py b/modelscope/models/cv/image_to_image_translation/__init__.py index e69de29b..35aab6be 100644 --- a/modelscope/models/cv/image_to_image_translation/__init__.py +++ b/modelscope/models/cv/image_to_image_translation/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model_translation import UNet + +else: + _import_structure = { + 'image_to_image_translation_model': ['UNet'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_to_image_translation/data/__init__.py b/modelscope/models/cv/image_to_image_translation/data/__init__.py index 72450016..724bca04 100644 --- a/modelscope/models/cv/image_to_image_translation/data/__init__.py +++ b/modelscope/models/cv/image_to_image_translation/data/__init__.py @@ -1 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .transforms import * # noqa F403 diff --git a/modelscope/models/cv/image_to_image_translation/models/__init__.py b/modelscope/models/cv/image_to_image_translation/models/__init__.py index 322d78f2..7fdd8189 100644 --- a/modelscope/models/cv/image_to_image_translation/models/__init__.py +++ b/modelscope/models/cv/image_to_image_translation/models/__init__.py @@ -1,2 +1,3 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .autoencoder import * # noqa F403 from .clip import * # noqa F403 diff --git a/modelscope/models/cv/image_to_image_translation/ops/__init__.py b/modelscope/models/cv/image_to_image_translation/ops/__init__.py index 59082d72..474c811b 100644 --- a/modelscope/models/cv/image_to_image_translation/ops/__init__.py +++ b/modelscope/models/cv/image_to_image_translation/ops/__init__.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .degradation import * # noqa F403 from .diffusion import * # noqa F403 from .losses import * # noqa F403 diff --git a/modelscope/models/cv/product_retrieval_embedding/__init__.py b/modelscope/models/cv/product_retrieval_embedding/__init__.py index 7a02a60f..2cbc9099 100644 --- a/modelscope/models/cv/product_retrieval_embedding/__init__.py +++ b/modelscope/models/cv/product_retrieval_embedding/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/cv/product_retrieval_embedding/item_detection.py b/modelscope/models/cv/product_retrieval_embedding/item_detection.py index d5589969..2002c6cb 100644 --- a/modelscope/models/cv/product_retrieval_embedding/item_detection.py +++ b/modelscope/models/cv/product_retrieval_embedding/item_detection.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import cv2 import numpy as np diff --git a/modelscope/models/cv/product_retrieval_embedding/item_embedding.py b/modelscope/models/cv/product_retrieval_embedding/item_embedding.py index 0444596c..ea9ec846 100644 --- a/modelscope/models/cv/product_retrieval_embedding/item_embedding.py +++ b/modelscope/models/cv/product_retrieval_embedding/item_embedding.py @@ -1,3 +1,4 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import cv2 import numpy as np import torch.nn as nn diff --git a/modelscope/models/cv/product_retrieval_embedding/item_model.py b/modelscope/models/cv/product_retrieval_embedding/item_model.py index 85a636c0..3964efbe 100644 --- a/modelscope/models/cv/product_retrieval_embedding/item_model.py +++ b/modelscope/models/cv/product_retrieval_embedding/item_model.py @@ -1,3 +1,5 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + import os.path as osp from typing import Any, Dict diff --git a/modelscope/models/multi_modal/diffusion/__init__.py b/modelscope/models/multi_modal/diffusion/__init__.py index 28813cc9..e7e374b6 100644 --- a/modelscope/models/multi_modal/diffusion/__init__.py +++ b/modelscope/models/multi_modal/diffusion/__init__.py @@ -1 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .model import DiffusionForTextToImageSynthesis diff --git a/modelscope/models/multi_modal/gemm/__init__.py b/modelscope/models/multi_modal/gemm/__init__.py index b920628e..fe5df1fe 100644 --- a/modelscope/models/multi_modal/gemm/__init__.py +++ b/modelscope/models/multi_modal/gemm/__init__.py @@ -1 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .gemm_model import GEMMForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py index accbb56e..1b3f445b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/__init__.py @@ -1 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. from .model import MultiStageDiffusionForTextToImageSynthesis diff --git a/modelscope/models/multi_modal/team/__init__.py b/modelscope/models/multi_modal/team/__init__.py index 0597040c..58bbdca5 100644 --- a/modelscope/models/multi_modal/team/__init__.py +++ b/modelscope/models/multi_modal/team/__init__.py @@ -1 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
from .team_model import TEAMForMultiModalSimilarity From c582b19115189aa41fc755dd670b973267be67ac Mon Sep 17 00:00:00 2001 From: "james.wjg" Date: Thu, 20 Oct 2022 21:13:19 +0800 Subject: [PATCH 085/129] [to #42322933]cv/video_summarization fix a code bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复一行错误代码 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10477534 * fix a code bug --- modelscope/models/cv/video_summarization/summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py index 75251989..c9987670 100644 --- a/modelscope/models/cv/video_summarization/summarizer.py +++ b/modelscope/models/cv/video_summarization/summarizer.py @@ -161,7 +161,7 @@ def summary_format(summary, fps): is_summary_frame = False if is_summary_frame and summary[-1] == 1: - end_frame = len(frame_idxes) - 1 + end_frame = len(summary) - 1 frames_list.append([start_frame, end_frame]) output = [] From baff7c5b64ac3da953aac7b5877c2adf79dfa3ac Mon Sep 17 00:00:00 2001 From: "liugao.lg" Date: Fri, 21 Oct 2022 09:11:15 +0800 Subject: [PATCH 086/129] [to #42322933]add ofa-ocr-recogniiton pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增ofa关于日常场景文字识别的任务,主要包括: 1、新增pipeline及task名称定义; 2、新增pipeline、task、model及prepreocess核心类方法的代码逻辑; 3、其它同步修正的小细节逻辑; Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10471089 --- data/test/images/image_ocr_recognition.jpg | 3 + modelscope/metainfo.py | 1 + .../models/multi_modal/ofa/utils/constant.py | 1 + .../models/multi_modal/ofa_for_all_tasks.py | 2 + modelscope/outputs.py | 1 + .../multi_modal/ocr_recognition_pipeline.py | 52 ++++++++++ modelscope/preprocessors/multi_modal.py | 2 + modelscope/preprocessors/ofa/__init__.py | 1 + .../preprocessors/ofa/ocr_recognition.py | 99 +++++++++++++++++++ modelscope/utils/constant.py | 1 + tests/pipelines/test_ofa_tasks.py | 8 ++ 11 files changed, 171 insertions(+) create mode 100644 data/test/images/image_ocr_recognition.jpg create mode 100644 modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py create mode 100644 modelscope/preprocessors/ofa/ocr_recognition.py diff --git a/data/test/images/image_ocr_recognition.jpg b/data/test/images/image_ocr_recognition.jpg new file mode 100644 index 00000000..b41287cd --- /dev/null +++ b/data/test/images/image_ocr_recognition.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772b19f76c98044e39330853928624f10e085106a4292b4dd19f865531080747 +size 959 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 31bef3b8..732f8ffa 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -263,6 +263,7 @@ class Pipelines(object): text_to_image_synthesis = 'text-to-image-synthesis' video_multi_modal_embedding = 'video-multi-modal-embedding' image_text_retrieval = 'image-text-retrieval' + ofa_ocr_recognition = 'ofa-ocr-recognition' class Trainers(object): diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py index 984da443..eec2cc6c 100644 --- a/modelscope/models/multi_modal/ofa/utils/constant.py +++ b/modelscope/models/multi_modal/ofa/utils/constant.py @@ -3,6 +3,7 @@ from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks OFA_TASK_KEY_MAPPING = { + Tasks.ofa_ocr_recognition: OutputKeys.TEXT, Tasks.image_captioning: 
OutputKeys.CAPTION, Tasks.summarization: OutputKeys.TEXT, Tasks.visual_question_answering: OutputKeys.TEXT, diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 45bafde9..20cab6a6 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -27,6 +27,7 @@ __all__ = ['OfaForAllTasks'] @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) +@MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa) @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa) @MODELS.register_module( Tasks.visual_question_answering, module_name=Models.ofa) @@ -96,6 +97,7 @@ class OfaForAllTasks(TorchModel): 'traverse': self._traverse_inference, } self.task_inference_mapping = { + Tasks.ofa_ocr_recognition: self._text_gen_inference, Tasks.image_captioning: self._text_gen_inference, Tasks.summarization: self._text_gen_inference, Tasks.visual_grounding: self._visual_grounding_inference, diff --git a/modelscope/outputs.py b/modelscope/outputs.py index fbe15646..365e2bf9 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -661,6 +661,7 @@ TASK_OUTPUTS = { # "caption": "this is an image caption text." # } Tasks.image_captioning: [OutputKeys.CAPTION], + Tasks.ofa_ocr_recognition: [OutputKeys.TEXT], # visual grounding result for single sample # { diff --git a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py new file mode 100644 index 00000000..9cd63b6c --- /dev/null +++ b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import OfaPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition) +class OcrRecognitionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a ocr recognition pipeline for prediction + Args: + model: model id on modelscope hub. 
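For reference, a minimal usage sketch of the pipeline defined here, condensed from the unit test added at the end of this patch; the model id, sample image path and output key are taken from that test and are illustrative rather than normative (a later patch in this series renames the task key to Tasks.ocr_recognition).

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the new OFA OCR recognition pipeline and run it on the sample image
# shipped with this patch; the recognized text comes back under OutputKeys.TEXT.
ocr_recognize = pipeline(
    Tasks.ofa_ocr_recognition,
    model='damo/ofa_ocr-recognition_scene_base_zh')
result = ocr_recognize('data/test/images/image_ocr_recognition.jpg')
print(result[OutputKeys.TEXT])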
+ """ + super().__init__(model=model) + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or OfaForAllTasks' + if isinstance(model, str): + pipe_model = Model.from_pretrained(model) + elif isinstance(model, Model): + pipe_model = model + else: + raise NotImplementedError + pipe_model.model.eval() + if preprocessor is None: + if isinstance(pipe_model, OfaForAllTasks): + preprocessor = OfaPreprocessor(pipe_model.model_dir) + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index f38ff8ae..6f3245c3 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -34,6 +34,7 @@ class OfaPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) preprocess_mapping = { + Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor, Tasks.image_captioning: OfaImageCaptioningPreprocessor, Tasks.visual_grounding: OfaVisualGroundingPreprocessor, Tasks.visual_question_answering: @@ -45,6 +46,7 @@ class OfaPreprocessor(Preprocessor): Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor } input_key_mapping = { + Tasks.ofa_ocr_recognition: ['image'], Tasks.image_captioning: ['image'], Tasks.image_classification: ['image'], Tasks.summarization: ['text'], diff --git a/modelscope/preprocessors/ofa/__init__.py b/modelscope/preprocessors/ofa/__init__.py index 95d72fe1..59b94b2b 100644 --- a/modelscope/preprocessors/ofa/__init__.py +++ b/modelscope/preprocessors/ofa/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from .image_captioning import OfaImageCaptioningPreprocessor from .image_classification import OfaImageClassificationPreprocessor +from .ocr_recognition import OfaOcrRecognitionPreprocessor from .summarization import OfaSummarizationPreprocessor from .text_classification import OfaTextClassificationPreprocessor from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py new file mode 100644 index 00000000..1d30e572 --- /dev/null +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -0,0 +1,99 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import random +import unicodedata +from typing import Any, Dict, Union + +import torch +from PIL import Image +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from torchvision.transforms import functional as F + +from modelscope.preprocessors.image import load_image +from .base import OfaBasePreprocessor + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def ocr_resize(img, patch_image_size, is_document=False): + img = img.convert('RGB') + width, height = img.size + + if is_document: + new_height, new_width = 64, 1920 + else: + if width >= height: + new_width = max(64, patch_image_size) + new_height = max(64, int(patch_image_size * (height / width))) + top = (patch_image_size - new_height) // 2 + bottom = patch_image_size - new_height - top + left, right = 0, 0 + else: + new_height = max(64, patch_image_size) + new_width = max(64, int(patch_image_size * (width / height))) + left = (patch_image_size - new_width) // 2 + right = patch_image_size - new_width - left + top, bottom = 0, 0 + + img_new = F.resize( + img, + (new_height, new_width), + interpolation=InterpolationMode.BICUBIC, + ) + + if is_document: + img_split = transforms.ToTensor()(img_new).chunk(4, dim=-1) + img_new = transforms.ToPILImage()(torch.cat(img_split, dim=-2)) + new_width, new_height = img_new.size + top = (patch_image_size - new_height) // 2 + bottom = patch_image_size - new_height - top + left, right = 0, 0 + + img_new = F.pad( + img_new, padding=[left, top, right, bottom], padding_mode='edge') + assert img_new.size == (patch_image_size, patch_image_size) + + return img_new + + +class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): + + def __init__(self, cfg, model_dir): + """preprocess the data + + Args: + cfg(modelscope.utils.config.ConfigDict) : model config + model_dir (str): model path + """ + super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir) + # Initialize transform + if self.cfg.model.imagenet_default_mean_and_std: + mean = IMAGENET_DEFAULT_MEAN + std = IMAGENET_DEFAULT_STD + else: + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + + self.patch_resize_transform = transforms.Compose([ + lambda image: ocr_resize( + image, + self.cfg.model.patch_image_size, + is_document=self.cfg.model.is_document), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = data['image'] if isinstance( + data['image'], Image.Image) else load_image(data['image']) + patch_image = self.patch_resize_transform(image) + prompt = self.cfg.model.get('prompt', '图片上的文字是什么?') + inputs = self.get_inputs(prompt) + + sample = { + 'source': inputs, + 'patch_image': patch_image, + 'patch_mask': torch.tensor([True]) + } + return sample diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 6c0f3e98..865e1d4f 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -151,6 +151,7 @@ class MultiModalTasks(object): visual_entailment = 'visual-entailment' video_multi_modal_embedding = 'video-multi-modal-embedding' image_text_retrieval = 'image-text-retrieval' + ofa_ocr_recognition = 'ofa-ocr-recognition' class TasksIODescriptions(object): diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index f8366508..05ecc719 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -45,6 +45,14 @@ class OfaTasksTest(unittest.TestCase, 
DemoCompatibilityCheck): result = img_captioning('data/test/images/image_captioning.png') print(result[OutputKeys.CAPTION]) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_ocr_recognize_with_name(self): + ocr_recognize = pipeline( + Tasks.ofa_ocr_recognition, + model='damo/ofa_ocr-recognition_scene_base_zh') + result = ocr_recognize('data/test/images/image_ocr_recognition.jpg') + print(result[OutputKeys.TEXT]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_classification_with_model(self): model = Model.from_pretrained( From 533ab3df637b115394a9e0321c10a08978486c97 Mon Sep 17 00:00:00 2001 From: "baiguan.yt" Date: Fri, 21 Oct 2022 14:54:24 +0800 Subject: [PATCH 087/129] [to #42322933]update msdatasets for image-portrait-enhancement training Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10453584 --- modelscope/metainfo.py | 1 + .../image_portrait_enhancement_metric.py | 2 + .../image_portrait_enhancement.py | 28 +++---- .../msdatasets/task_datasets/__init__.py | 2 + .../image_portrait_enhancement/__init__.py | 23 ++++++ .../image_portrait_enhancement/data_utils.py | 32 ++++++++ .../image_portrait_enhancement_dataset.py | 51 +++++++++++++ ...test_image_portrait_enhancement_trainer.py | 73 +++++++------------ 8 files changed, 150 insertions(+), 62 deletions(-) create mode 100644 modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py create mode 100644 modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 732f8ffa..fa1605de 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -459,3 +459,4 @@ class Datasets(object): SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' + PairedDataset = 'PairedDataset' diff --git a/modelscope/metrics/image_portrait_enhancement_metric.py b/modelscope/metrics/image_portrait_enhancement_metric.py index 5a81e956..7d94aade 100644 --- a/modelscope/metrics/image_portrait_enhancement_metric.py +++ b/modelscope/metrics/image_portrait_enhancement_metric.py @@ -2,6 +2,7 @@ # https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py from typing import Dict +import cv2 import numpy as np from modelscope.metainfo import Metrics @@ -37,6 +38,7 @@ class ImagePortraitEnhancementMetric(Metric): def add(self, outputs: Dict, inputs: Dict): ground_truths = outputs['target'] eval_results = outputs['pred'] + self.preds.extend(eval_results) self.targets.extend(ground_truths) diff --git a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py index 3650ac7b..26e9e532 100644 --- a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py +++ b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py @@ -35,7 +35,7 @@ class ImagePortraitEnhancement(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - self.size = 512 + self.size = 256 self.style_dim = 512 self.n_mlp = 8 self.mean_path_length = 0 @@ -131,9 +131,9 @@ class ImagePortraitEnhancement(TorchModel): return path_penalty, path_mean.detach(), path_lengths @torch.no_grad() - def _evaluate_postprocess(self, src: Tensor, + def _evaluate_postprocess(self, input: 
Tensor, target: Tensor) -> Dict[str, list]: - preds, _ = self.generator(src) + preds, _ = self.generator(input) preds = list(torch.split(preds, 1, 0)) targets = list(torch.split(target, 1, 0)) @@ -144,11 +144,11 @@ class ImagePortraitEnhancement(TorchModel): return {'pred': preds, 'target': targets} - def _train_forward_d(self, src: Tensor, target: Tensor) -> Tensor: + def _train_forward_d(self, input: Tensor, target: Tensor) -> Tensor: self.requires_grad(self.generator, False) self.requires_grad(self.discriminator, True) - preds, _ = self.generator(src) + preds, _ = self.generator(input) fake_pred = self.discriminator(preds) real_pred = self.discriminator(target) @@ -156,27 +156,27 @@ class ImagePortraitEnhancement(TorchModel): return d_loss - def _train_forward_d_r1(self, src: Tensor, target: Tensor) -> Tensor: - src.requires_grad = True + def _train_forward_d_r1(self, input: Tensor, target: Tensor) -> Tensor: + input.requires_grad = True target.requires_grad = True real_pred = self.discriminator(target) r1_loss = self.d_r1_loss(real_pred, target) return r1_loss - def _train_forward_g(self, src: Tensor, target: Tensor) -> Tensor: + def _train_forward_g(self, input: Tensor, target: Tensor) -> Tensor: self.requires_grad(self.generator, True) self.requires_grad(self.discriminator, False) - preds, _ = self.generator(src) + preds, _ = self.generator(input) fake_pred = self.discriminator(preds) - g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, src) + g_loss = self.g_nonsaturating_loss(fake_pred, preds, target, input) return g_loss - def _train_forward_g_path(self, src: Tensor, target: Tensor) -> Tensor: - fake_img, latents = self.generator(src, return_latents=True) + def _train_forward_g_path(self, input: Tensor, target: Tensor) -> Tensor: + fake_img, latents = self.generator(input, return_latents=True) path_loss, self.mean_path_length, path_lengths = self.g_path_regularize( fake_img, latents, self.mean_path_length) @@ -184,8 +184,8 @@ class ImagePortraitEnhancement(TorchModel): return path_loss @torch.no_grad() - def _inference_forward(self, src: Tensor) -> Dict[str, Tensor]: - return {'outputs': (self.generator(src)[0] * 0.5 + 0.5).clamp(0, 1)} + def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]: + return {'outputs': (self.generator(input)[0] * 0.5 + 0.5).clamp(0, 1)} def forward(self, input: Dict[str, Tensor]) -> Dict[str, Union[list, Tensor]]: diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index 7c31969a..914c41bf 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -27,6 +27,8 @@ else: 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], 'image_inpainting': ['ImageInpaintingDataset'], 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], + 'image_portrait_enhancement_dataset': + ['ImagePortraitEnhancementDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py new file mode 100644 index 00000000..4df24fae --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
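For context, a condensed sketch of how the dataset package introduced below is wired up; it mirrors the updated trainer test further down in this patch, so the dataset name, namespace, split and download mode are taken from that test rather than invented here.

from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \
    ImagePortraitEnhancementDataset
from modelscope.utils.constant import DownloadMode

# Load the paired hq/lq portrait dataset from the hub, then wrap it in the new
# task dataset so the trainer receives {'input': ..., 'target': ...} samples.
hf_ds = MsDataset.load(
    'image-portrait-enhancement-dataset',
    namespace='modelscope',
    subset_name='default',
    split='test',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds
train_dataset = ImagePortraitEnhancementDataset(hf_ds, is_train=True)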
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .image_portrait_enhancement_dataset import ImagePortraitEnhancementDataset + +else: + _import_structure = { + 'image_portrait_enhancement_dataset': + ['ImagePortraitEnhancementDataset'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py new file mode 100644 index 00000000..1133d3c2 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/data_utils.py @@ -0,0 +1,32 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + +import cv2 +import torch + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. + """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) diff --git a/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py new file mode 100644 index 00000000..58d40778 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_portrait_enhancement/image_portrait_enhancement_dataset.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import cv2 +import numpy as np + +from modelscope.metainfo import Datasets, Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from .data_utils import img2tensor + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_COLOR).astype(np.float32) / 255.0 + + +@TASK_DATASETS.register_module( + Tasks.image_portrait_enhancement, module_name=Datasets.PairedDataset) +class ImagePortraitEnhancementDataset(TorchTaskDataset): + """Paired image dataset for image portrait enhancement. + """ + + def __init__(self, dataset, is_train): + self.dataset = dataset + self.gt_size = 256 + self.is_train = is_train + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load gt and lq images. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. 
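# Note on the expected schema (an assumption drawn from the column names used
# just below, not something the class validates): each row of the wrapped
# dataset should expose local file paths under the 'hq:FILE' (ground truth)
# and 'lq:FILE' (degraded input) columns; both images are resized to gt_size
# and mapped to [-1, 1] float tensors before being returned.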
+ item_dict = self.dataset[index] + gt_path = item_dict['hq:FILE'] + img_gt = default_loader(gt_path) + lq_path = item_dict['lq:FILE'] + img_lq = default_loader(lq_path) + + gt_size = self.gt_size + img_gt = cv2.resize(img_gt, (gt_size, gt_size)) + img_lq = cv2.resize(img_lq, (gt_size, gt_size)) + + # BGR to RGB, HWC to CHW, numpy to tensor + img_gt, img_lq = img2tensor([img_gt, img_lq], + bgr2rgb=True, + float32=True) + + return {'input': (img_lq - 0.5) / 0.5, 'target': (img_gt - 0.5) / 0.5} diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py index 049adf7e..5c47a59b 100644 --- a/tests/trainers/test_image_portrait_enhancement_trainer.py +++ b/tests/trainers/test_image_portrait_enhancement_trainer.py @@ -14,52 +14,14 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers from modelscope.models.cv.image_portrait_enhancement import \ ImagePortraitEnhancement +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets.image_portrait_enhancement import \ + ImagePortraitEnhancementDataset from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.test_utils import test_level -class PairedImageDataset(data.Dataset): - - def __init__(self, root, size=512): - super(PairedImageDataset, self).__init__() - self.size = size - gt_dir = osp.join(root, 'gt') - lq_dir = osp.join(root, 'lq') - self.gt_filelist = os.listdir(gt_dir) - self.gt_filelist = sorted(self.gt_filelist, key=lambda x: int(x[:-4])) - self.gt_filelist = [osp.join(gt_dir, f) for f in self.gt_filelist] - self.lq_filelist = os.listdir(lq_dir) - self.lq_filelist = sorted(self.lq_filelist, key=lambda x: int(x[:-4])) - self.lq_filelist = [osp.join(lq_dir, f) for f in self.lq_filelist] - - def _img_to_tensor(self, img): - img = torch.from_numpy(img[:, :, [2, 1, 0]]).permute(2, 0, 1).type( - torch.float32) / 255. 
- return (img - 0.5) / 0.5 - - def __getitem__(self, index): - lq = cv2.imread(self.lq_filelist[index]) - gt = cv2.imread(self.gt_filelist[index]) - lq = cv2.resize( - lq, (self.size, self.size), interpolation=cv2.INTER_CUBIC) - gt = cv2.resize( - gt, (self.size, self.size), interpolation=cv2.INTER_CUBIC) - - return \ - {'src': self._img_to_tensor(lq), 'target': self._img_to_tensor(gt)} - - def __len__(self): - return len(self.gt_filelist) - - def to_torch_dataset(self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, List[Callable]] = None, - **format_kwargs): - # self.preprocessor = preprocessors - return self - - class TestImagePortraitEnhancementTrainer(unittest.TestCase): def setUp(self): @@ -70,8 +32,23 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): self.model_id = 'damo/cv_gpen_image-portrait-enhancement' - self.dataset = PairedImageDataset( - './data/test/images/face_enhancement/') + dataset_train = MsDataset.load( + 'image-portrait-enhancement-dataset', + namespace='modelscope', + subset_name='default', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + dataset_val = MsDataset.load( + 'image-portrait-enhancement-dataset', + namespace='modelscope', + subset_name='default', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + + self.dataset_train = ImagePortraitEnhancementDataset( + dataset_train, is_train=True) + self.dataset_val = ImagePortraitEnhancementDataset( + dataset_val, is_train=False) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) @@ -81,8 +58,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): def test_trainer(self): kwargs = dict( model=self.model_id, - train_dataset=self.dataset, - eval_dataset=self.dataset, + train_dataset=self.dataset_train, + eval_dataset=self.dataset_val, device='gpu', work_dir=self.tmp_dir) @@ -101,8 +78,8 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), model=model, - train_dataset=self.dataset, - eval_dataset=self.dataset, + train_dataset=self.dataset_train, + eval_dataset=self.dataset_val, device='gpu', max_epochs=2, work_dir=self.tmp_dir) From f7f7eb21dced72762ed26a74bf7baa2be58822c6 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 21 Oct 2022 22:10:40 +0800 Subject: [PATCH 088/129] [to #42322933]Fix the logic of fast tokenizer 1. Change the logic of using fast tokenizer from mode to user arguments and tokenizer_config.json This is to fix the problem of RANER must use fast tokenizer in some special models. Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10488982 --- modelscope/preprocessors/nlp/nlp_base.py | 47 +++++++++++------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index bc96f569..9049ec99 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -1,9 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- +import os import os.path as osp import re -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union +import json import numpy as np import sentencepiece as spm import torch @@ -13,8 +14,7 @@ from modelscope.metainfo import Models, Preprocessors from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import (Config, ConfigFields, - use_task_specific_params) +from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger @@ -83,6 +83,15 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self._mode = mode self.label = kwargs.pop('label', OutputKeys.LABEL) + self.use_fast = kwargs.pop('use_fast', None) + if self.use_fast is None and os.path.isfile( + os.path.join(model_dir, 'tokenizer_config.json')): + with open(os.path.join(model_dir, 'tokenizer_config.json'), + 'r') as f: + json_config = json.load(f) + self.use_fast = json_config.get('use_fast') + self.use_fast = False if self.use_fast is None else self.use_fast + self.label2id = None if 'label2id' in kwargs: self.label2id = kwargs.pop('label2id') @@ -118,32 +127,23 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if model_type in (Models.structbert, Models.gpt3, Models.palm, Models.plug): from modelscope.models.nlp.structbert import SbertTokenizer, SbertTokenizerFast - return SbertTokenizer.from_pretrained( - model_dir - ) if self._mode == ModeKeys.INFERENCE else SbertTokenizerFast.from_pretrained( - model_dir) + tokenizer = SbertTokenizerFast if self.use_fast else SbertTokenizer + return tokenizer.from_pretrained(model_dir) elif model_type == Models.veco: from modelscope.models.nlp.veco import VecoTokenizer, VecoTokenizerFast - return VecoTokenizer.from_pretrained( - model_dir - ) if self._mode == ModeKeys.INFERENCE else VecoTokenizerFast.from_pretrained( - model_dir) + tokenizer = VecoTokenizerFast if self.use_fast else VecoTokenizer + return tokenizer.from_pretrained(model_dir) elif model_type == Models.deberta_v2: from modelscope.models.nlp.deberta_v2 import DebertaV2Tokenizer, DebertaV2TokenizerFast - return DebertaV2Tokenizer.from_pretrained( - model_dir - ) if self._mode == ModeKeys.INFERENCE else DebertaV2TokenizerFast.from_pretrained( - model_dir) + tokenizer = DebertaV2TokenizerFast if self.use_fast else DebertaV2Tokenizer + return tokenizer.from_pretrained(model_dir) elif not self.is_transformer_based_model: from transformers import BertTokenizer, BertTokenizerFast - return BertTokenizer.from_pretrained( - model_dir - ) if self._mode == ModeKeys.INFERENCE else BertTokenizerFast.from_pretrained( - model_dir) + tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer + return tokenizer.from_pretrained(model_dir) else: return AutoTokenizer.from_pretrained( - model_dir, - use_fast=False if self._mode == ModeKeys.INFERENCE else True) + model_dir, use_fast=self.use_fast) def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: """process the raw input data @@ -593,9 +593,6 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): else: self.is_split_into_words = self.tokenizer.init_kwargs.get( 'is_split_into_words', False) - if 'label2id' in kwargs: - kwargs.pop('label2id') - self.tokenize_kwargs = kwargs 
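In short, the fast/slow tokenizer choice no longer depends on the preprocessing mode: it is resolved from an explicit argument, then from the model's tokenizer_config.json, and defaults to the slow tokenizer. A hedged sketch of the two opt-in paths; the model directory is a placeholder, and the example assumes the concrete preprocessor forwards extra kwargs to the base class as the classes in this file do.

from modelscope.preprocessors import SequenceClassificationPreprocessor

# 1) Per call: ask for the fast tokenizer explicitly via the new kwarg.
preprocessor = SequenceClassificationPreprocessor(
    '/path/to/model_dir',   # placeholder model directory
    use_fast=True)

# 2) Per model: ship a tokenizer_config.json next to the weights containing
#    {"use_fast": true}; when no explicit kwarg is given, the base class reads
#    that key and falls back to the slow tokenizer if it is missing.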
@type_assert(object, str) def __call__(self, data: str) -> Dict[str, Any]: From dee93c40e28471c311c1c921debf754de7b691d1 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Sat, 22 Oct 2022 16:28:30 +0800 Subject: [PATCH 089/129] [to #42322933] force download dataset for portraint enhancement --- tests/trainers/test_image_portrait_enhancement_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainers/test_image_portrait_enhancement_trainer.py b/tests/trainers/test_image_portrait_enhancement_trainer.py index 5c47a59b..123e0098 100644 --- a/tests/trainers/test_image_portrait_enhancement_trainer.py +++ b/tests/trainers/test_image_portrait_enhancement_trainer.py @@ -37,13 +37,13 @@ class TestImagePortraitEnhancementTrainer(unittest.TestCase): namespace='modelscope', subset_name='default', split='test', - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds dataset_val = MsDataset.load( 'image-portrait-enhancement-dataset', namespace='modelscope', subset_name='default', split='test', - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds self.dataset_train = ImagePortraitEnhancementDataset( dataset_train, is_train=True) From 9bc06716c13bfad650b9ec3cc402f0efb58465c0 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Sat, 22 Oct 2022 16:30:19 +0800 Subject: [PATCH 090/129] [to #42322933] fix typo --- modelscope/utils/registry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index 73e94b3c..d6994bd3 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -196,8 +196,7 @@ def build_from_cfg(cfg, raise KeyError( f'{obj_type} is not in the {registry.name}' f' registry group {group_key}. Please make' - f' sure the correct version of 1qqQModelScope library is used.' 
- ) + f' sure the correct version of ModelScope library is used.') obj_cls.group_key = group_key elif inspect.isclass(obj_type) or inspect.isfunction(obj_type): obj_cls = obj_type From 683ee5bfed89f5213b0d770cf7a18fefc666f552 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Sat, 22 Oct 2022 17:01:03 +0800 Subject: [PATCH 091/129] [to #42322933]use Tasks.ocr_recognition Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10490937 --- modelscope/models/multi_modal/ofa/utils/constant.py | 4 ++-- modelscope/models/multi_modal/ofa_for_all_tasks.py | 8 ++++---- modelscope/outputs.py | 2 +- .../multi_modal/multi_modal_embedding_pipeline.py | 2 ++ .../pipelines/multi_modal/ocr_recognition_pipeline.py | 2 +- modelscope/pipelines/nlp/summarization_pipeline.py | 2 +- modelscope/preprocessors/multi_modal.py | 8 ++++---- modelscope/utils/constant.py | 3 +-- tests/pipelines/test_ofa_tasks.py | 6 +++--- 9 files changed, 19 insertions(+), 18 deletions(-) diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py index eec2cc6c..d3257383 100644 --- a/modelscope/models/multi_modal/ofa/utils/constant.py +++ b/modelscope/models/multi_modal/ofa/utils/constant.py @@ -3,9 +3,9 @@ from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks OFA_TASK_KEY_MAPPING = { - Tasks.ofa_ocr_recognition: OutputKeys.TEXT, + Tasks.ocr_recognition: OutputKeys.TEXT, Tasks.image_captioning: OutputKeys.CAPTION, - Tasks.summarization: OutputKeys.TEXT, + Tasks.text_summarization: OutputKeys.TEXT, Tasks.visual_question_answering: OutputKeys.TEXT, Tasks.visual_grounding: OutputKeys.BOXES, Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS), diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 20cab6a6..6e331228 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -27,13 +27,13 @@ __all__ = ['OfaForAllTasks'] @MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) -@MODELS.register_module(Tasks.ofa_ocr_recognition, module_name=Models.ofa) +@MODELS.register_module(Tasks.ocr_recognition, module_name=Models.ofa) @MODELS.register_module(Tasks.visual_grounding, module_name=Models.ofa) @MODELS.register_module( Tasks.visual_question_answering, module_name=Models.ofa) @MODELS.register_module(Tasks.visual_entailment, module_name=Models.ofa) @MODELS.register_module(Tasks.image_classification, module_name=Models.ofa) -@MODELS.register_module(Tasks.summarization, module_name=Models.ofa) +@MODELS.register_module(Tasks.text_summarization, module_name=Models.ofa) @MODELS.register_module(Tasks.text_classification, module_name=Models.ofa) class OfaForAllTasks(TorchModel): @@ -97,9 +97,9 @@ class OfaForAllTasks(TorchModel): 'traverse': self._traverse_inference, } self.task_inference_mapping = { - Tasks.ofa_ocr_recognition: self._text_gen_inference, + Tasks.ocr_recognition: self._text_gen_inference, Tasks.image_captioning: self._text_gen_inference, - Tasks.summarization: self._text_gen_inference, + Tasks.text_summarization: self._text_gen_inference, Tasks.visual_grounding: self._visual_grounding_inference, Tasks.visual_entailment: inference_d[self.gen_type], Tasks.visual_question_answering: inference_d[self.gen_type], diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 365e2bf9..af37eb84 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -661,7 +661,7 @@ 
TASK_OUTPUTS = { # "caption": "this is an image caption text." # } Tasks.image_captioning: [OutputKeys.CAPTION], - Tasks.ofa_ocr_recognition: [OutputKeys.TEXT], + Tasks.ocr_recognition: [OutputKeys.TEXT], # visual grounding result for single sample # { diff --git a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py index 76011be0..d3f15c23 100644 --- a/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py +++ b/modelscope/pipelines/multi_modal/multi_modal_embedding_pipeline.py @@ -11,6 +11,8 @@ from modelscope.utils.logger import get_logger logger = get_logger() +@PIPELINES.register_module( + Tasks.image_text_retrieval, module_name=Pipelines.multi_modal_embedding) @PIPELINES.register_module( Tasks.multi_modal_embedding, module_name=Pipelines.multi_modal_embedding) class MultiModalEmbeddingPipeline(Pipeline): diff --git a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py index 9cd63b6c..c61b38f3 100644 --- a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py @@ -16,7 +16,7 @@ logger = get_logger() @PIPELINES.register_module( - Tasks.ofa_ocr_recognition, module_name=Pipelines.ofa_ocr_recognition) + Tasks.ocr_recognition, module_name=Pipelines.ofa_ocr_recognition) class OcrRecognitionPipeline(Pipeline): def __init__(self, diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 7a91eff1..30dd4b30 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -13,7 +13,7 @@ logger = get_logger() @PIPELINES.register_module( - Tasks.summarization, module_name=Pipelines.text_generation) + Tasks.text_summarization, module_name=Pipelines.text_generation) class SummarizationPipeline(Pipeline): def __init__(self, diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 6f3245c3..4427c096 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -34,7 +34,7 @@ class OfaPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) preprocess_mapping = { - Tasks.ofa_ocr_recognition: OfaOcrRecognitionPreprocessor, + Tasks.ocr_recognition: OfaOcrRecognitionPreprocessor, Tasks.image_captioning: OfaImageCaptioningPreprocessor, Tasks.visual_grounding: OfaVisualGroundingPreprocessor, Tasks.visual_question_answering: @@ -42,14 +42,14 @@ class OfaPreprocessor(Preprocessor): Tasks.visual_entailment: OfaVisualEntailmentPreprocessor, Tasks.image_classification: OfaImageClassificationPreprocessor, Tasks.text_classification: OfaTextClassificationPreprocessor, - Tasks.summarization: OfaSummarizationPreprocessor, + Tasks.text_summarization: OfaSummarizationPreprocessor, Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor } input_key_mapping = { - Tasks.ofa_ocr_recognition: ['image'], + Tasks.ocr_recognition: ['image'], Tasks.image_captioning: ['image'], Tasks.image_classification: ['image'], - Tasks.summarization: ['text'], + Tasks.text_summarization: ['text'], Tasks.text_classification: ['text', 'text2'], Tasks.visual_grounding: ['image', 'text'], Tasks.visual_question_answering: ['image', 'text'], diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 865e1d4f..8e986b61 100644 --- a/modelscope/utils/constant.py +++ 
b/modelscope/utils/constant.py @@ -117,7 +117,7 @@ class NLPTasks(object): table_question_answering = 'table-question-answering' sentence_embedding = 'sentence-embedding' fill_mask = 'fill-mask' - summarization = 'summarization' + text_summarization = 'text-summarization' question_answering = 'question-answering' zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' @@ -151,7 +151,6 @@ class MultiModalTasks(object): visual_entailment = 'visual-entailment' video_multi_modal_embedding = 'video-multi-modal-embedding' image_text_retrieval = 'image-text-retrieval' - ofa_ocr_recognition = 'ofa-ocr-recognition' class TasksIODescriptions(object): diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 05ecc719..57dcb0c3 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -48,7 +48,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_ocr_recognize_with_name(self): ocr_recognize = pipeline( - Tasks.ofa_ocr_recognition, + Tasks.ocr_recognition, model='damo/ofa_ocr-recognition_scene_base_zh') result = ocr_recognize('data/test/images/image_ocr_recognition.jpg') print(result[OutputKeys.TEXT]) @@ -75,7 +75,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_summarization_with_model(self): model = Model.from_pretrained( 'damo/ofa_summarization_gigaword_large_en') - ofa_pipe = pipeline(Tasks.summarization, model=model) + ofa_pipe = pipeline(Tasks.text_summarization, model=model) text = 'five-time world champion michelle kwan withdrew' + \ 'from the #### us figure skating championships on wednesday ,' + \ ' but will petition us skating officials for the chance to ' + \ @@ -87,7 +87,7 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_summarization_with_name(self): ofa_pipe = pipeline( - Tasks.summarization, + Tasks.text_summarization, model='damo/ofa_summarization_gigaword_large_en') text = 'five-time world champion michelle kwan withdrew' + \ 'from the #### us figure skating championships on wednesday ,' + \ From 824ee8232cdcd56d2e137eaa5de2da343b2839eb Mon Sep 17 00:00:00 2001 From: "zhangyanzhao.zyz" Date: Sat, 22 Oct 2022 17:12:48 +0800 Subject: [PATCH 092/129] =?UTF-8?q?[to=20#42322933]=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E8=AF=AD=E4=B9=89=E7=9B=B8=E5=85=B3=E6=80=A7=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E8=8B=B1=E6=96=87=E5=90=8D=E7=A7=B0=E4=B8=BAtext=20ranking?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=94=B9=E5=AF=B9=E5=BA=94=E5=8F=98=E9=87=8F?= =?UTF-8?q?=E5=90=8D=E5=92=8C=E7=B1=BB=E5=90=8D=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/coderevi?= =?UTF-8?q?ew/10491951?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metainfo.py | 6 ++--- modelscope/models/nlp/__init__.py | 5 ++-- .../{passage_ranking.py => text_ranking.py} | 10 ++++---- .../msdatasets/task_datasets/__init__.py | 4 +-- ...ing_dataset.py => text_ranking_dataset.py} | 16 ++++++------ modelscope/outputs.py | 2 +- modelscope/pipeline_inputs.py | 2 +- modelscope/pipelines/builder.py | 4 +-- modelscope/pipelines/nlp/__init__.py | 4 +-- ...g_pipeline.py => text_ranking_pipeline.py} | 10 ++++---- modelscope/preprocessors/__init__.py | 4 +-- modelscope/preprocessors/nlp/__init__.py | 4 +-- 
modelscope/preprocessors/nlp/nlp_base.py | 8 +++--- modelscope/trainers/__init__.py | 4 +-- modelscope/trainers/nlp/__init__.py | 4 +-- ...ing_trainer.py => text_ranking_trainer.py} | 11 ++++---- modelscope/utils/constant.py | 2 +- ...assage_ranking.py => test_text_ranking.py} | 25 +++++++++---------- ...nking.py => test_finetune_text_ranking.py} | 17 +++++++------ 19 files changed, 72 insertions(+), 70 deletions(-) rename modelscope/models/nlp/{passage_ranking.py => text_ranking.py} (90%) rename modelscope/msdatasets/task_datasets/{passage_ranking_dataset.py => text_ranking_dataset.py} (90%) rename modelscope/pipelines/nlp/{passage_ranking_pipeline.py => text_ranking_pipeline.py} (88%) rename modelscope/trainers/nlp/{passage_ranking_trainer.py => text_ranking_trainer.py} (95%) rename tests/pipelines/{test_passage_ranking.py => test_text_ranking.py} (70%) rename tests/trainers/{test_finetune_passage_ranking.py => test_finetune_text_ranking.py} (90%) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index fa1605de..1d6fd874 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -236,7 +236,7 @@ class Pipelines(object): conversational_text_to_sql = 'conversational-text-to-sql' table_question_answering_pipeline = 'table-question-answering-pipeline' sentence_embedding = 'sentence-embedding' - passage_ranking = 'passage-ranking' + text_ranking = 'text-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' @@ -297,7 +297,7 @@ class Trainers(object): dialog_intent_trainer = 'dialog-intent-trainer' nlp_base_trainer = 'nlp-base-trainer' nlp_veco_trainer = 'nlp-veco-trainer' - nlp_passage_ranking_trainer = 'nlp-passage-ranking-trainer' + nlp_text_ranking_trainer = 'nlp-text-ranking-trainer' # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' @@ -343,7 +343,7 @@ class Preprocessors(object): zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' text_error_correction = 'text-error-correction' sentence_embedding = 'sentence-embedding' - passage_ranking = 'passage-ranking' + text_ranking = 'text-ranking' sequence_labeling_tokenizer = 'sequence-labeling-tokenizer' word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' fill_mask = 'fill-mask' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 9e830d17..57222698 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -34,8 +34,9 @@ if TYPE_CHECKING: TaskModelForTextGeneration) from .token_classification import SbertForTokenClassification from .sentence_embedding import SentenceEmbedding - from .passage_ranking import PassageRanking + from .text_ranking import TextRanking from .T5 import T5ForConditionalGeneration + else: _import_structure = { 'backbones': ['SbertModel'], @@ -75,7 +76,7 @@ else: 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], - 'passage_ranking': ['PassageRanking'], + 'text_ranking': ['TextRanking'], 'T5': ['T5ForConditionalGeneration'], } diff --git a/modelscope/models/nlp/passage_ranking.py b/modelscope/models/nlp/text_ranking.py similarity index 90% rename from modelscope/models/nlp/passage_ranking.py rename to modelscope/models/nlp/text_ranking.py index 2a06ce45..5bc0635a 100644 --- a/modelscope/models/nlp/passage_ranking.py +++ b/modelscope/models/nlp/text_ranking.py @@ -13,18 +13,18 @@ from 
modelscope.models.nlp.structbert import SbertPreTrainedModel from modelscope.outputs import OutputKeys from modelscope.utils.constant import Tasks -__all__ = ['PassageRanking'] +__all__ = ['TextRanking'] -@MODELS.register_module(Tasks.passage_ranking, module_name=Models.bert) -class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel): +@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) +class TextRanking(SbertForSequenceClassification, SbertPreTrainedModel): base_model_prefix: str = 'bert' supports_gradient_checkpointing = True _keys_to_ignore_on_load_missing = [r'position_ids'] def __init__(self, config, model_dir, *args, **kwargs): if hasattr(config, 'base_model_prefix'): - PassageRanking.base_model_prefix = config.base_model_prefix + TextRanking.base_model_prefix = config.base_model_prefix super().__init__(config, model_dir) self.train_batch_size = kwargs.get('train_batch_size', 4) self.register_buffer( @@ -74,7 +74,7 @@ class PassageRanking(SbertForSequenceClassification, SbertPreTrainedModel): num_labels = kwargs.get('num_labels', 1) model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, PassageRanking).from_pretrained( + return super(SbertPreTrainedModel, TextRanking).from_pretrained( pretrained_model_name_or_path=kwargs.get('model_dir'), model_dir=kwargs.get('model_dir'), **model_args) diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index 914c41bf..92764155 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -12,14 +12,14 @@ if TYPE_CHECKING: from .movie_scene_segmentation import MovieSceneSegmentationDataset from .video_summarization_dataset import VideoSummarizationDataset from .image_inpainting import ImageInpaintingDataset - from .passage_ranking_dataset import PassageRankingDataset + from .text_ranking_dataset import TextRankingDataset else: _import_structure = { 'base': ['TaskDataset'], 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], - 'passage_ranking_dataset': ['PassageRankingDataset'], + 'text_ranking_dataset': ['TextRankingDataset'], 'veco_dataset': ['VecoDataset'], 'image_instance_segmentation_coco_dataset': ['ImageInstanceSegmentationCocoDataset'], diff --git a/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py similarity index 90% rename from modelscope/msdatasets/task_datasets/passage_ranking_dataset.py rename to modelscope/msdatasets/task_datasets/text_ranking_dataset.py index 517e0d36..dd44f7c2 100644 --- a/modelscope/msdatasets/task_datasets/passage_ranking_dataset.py +++ b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py @@ -16,8 +16,8 @@ from .torch_base_dataset import TorchTaskDataset @TASK_DATASETS.register_module( - group_key=Tasks.passage_ranking, module_name=Models.bert) -class PassageRankingDataset(TorchTaskDataset): + group_key=Tasks.text_ranking, module_name=Models.bert) +class TextRankingDataset(TorchTaskDataset): def __init__(self, datasets: Union[Any, List[Any]], @@ -35,8 +35,8 @@ class PassageRankingDataset(TorchTaskDataset): 'positive_passages') self.neg_sequence = self.dataset_config.get('neg_sequence', 'negative_passages') - self.passage_text_fileds = self.dataset_config.get( - 'passage_text_fileds', ['title', 'text']) + self.text_fileds = self.dataset_config.get('text_fileds', + ['title', 'text']) 
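For orientation, a minimal sketch of the renamed task in use, condensed from the updated tests/pipelines/test_text_ranking.py further down in this patch; the candidate passages are placeholders because the original test sentences are elided in this diff, and the default model keeps its original passage-ranking checkpoint id.

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the ranking pipeline under the new task key and score candidates
# against a query; scores are returned under OutputKeys.SCORES.
text_ranking = pipeline(
    task=Tasks.text_ranking,
    model='damo/nlp_corom_passage-ranking_english-base')
inputs = {
    'source_sentence': ["how long it take to get a master's degree"],
    'sentences_to_compare': [
        'Placeholder candidate passage one.',  # the real test sentences are not shown in this diff
        'Placeholder candidate passage two.',
    ],
}
print(text_ranking(input=inputs))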
self.qid_field = self.dataset_config.get('qid_field', 'query_id') if mode == ModeKeys.TRAIN: train_config = kwargs.get('train', {}) @@ -58,14 +58,14 @@ class PassageRankingDataset(TorchTaskDataset): pos_sequences = group[self.pos_sequence] pos_sequences = [ - ' '.join([ele[key] for key in self.passage_text_fileds]) + ' '.join([ele[key] for key in self.text_fileds]) for ele in pos_sequences ] labels.extend([1] * len(pos_sequences)) neg_sequences = group[self.neg_sequence] neg_sequences = [ - ' '.join([ele[key] for key in self.passage_text_fileds]) + ' '.join([ele[key] for key in self.text_fileds]) for ele in neg_sequences ] @@ -88,13 +88,13 @@ class PassageRankingDataset(TorchTaskDataset): pos_sequences = group[self.pos_sequence] pos_sequences = [ - ' '.join([ele[key] for key in self.passage_text_fileds]) + ' '.join([ele[key] for key in self.text_fileds]) for ele in pos_sequences ] neg_sequences = group[self.neg_sequence] neg_sequences = [ - ' '.join([ele[key] for key in self.passage_text_fileds]) + ' '.join([ele[key] for key in self.text_fileds]) for ele in neg_sequences ] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index af37eb84..13d440ca 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -506,7 +506,7 @@ TASK_OUTPUTS = { # } Tasks.text_error_correction: [OutputKeys.OUTPUT], Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], - Tasks.passage_ranking: [OutputKeys.SCORES], + Tasks.text_ranking: [OutputKeys.SCORES], # text generation result for single sample # { diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 34b731c6..77940c3c 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -162,7 +162,7 @@ TASK_INPUTS = { 'source_sentence': InputType.LIST, 'sentences_to_compare': InputType.LIST, }, - Tasks.passage_ranking: (InputType.TEXT, InputType.TEXT), + Tasks.text_ranking: (InputType.TEXT, InputType.TEXT), Tasks.text_generation: InputType.TEXT, Tasks.fill_mask: diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 8098bdec..f183afc1 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -20,8 +20,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.sentence_embedding: (Pipelines.sentence_embedding, 'damo/nlp_corom_sentence-embedding_english-base'), - Tasks.passage_ranking: (Pipelines.passage_ranking, - 'damo/nlp_corom_passage-ranking_english-base'), + Tasks.text_ranking: (Pipelines.text_ranking, + 'damo/nlp_corom_passage-ranking_english-base'), Tasks.word_segmentation: (Pipelines.word_segmentation, 'damo/nlp_structbert_word-segmentation_chinese-base'), diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index be854593..677151c0 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -17,7 +17,7 @@ if TYPE_CHECKING: from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline - from .passage_ranking_pipeline import PassageRankingPipeline + from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline from .sequence_classification_pipeline import SequenceClassificationPipeline from .summarization_pipeline import SummarizationPipeline @@ -51,7 +51,7 @@ else: 'information_extraction_pipeline': ['InformationExtractionPipeline'], 
'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], - 'passage_ranking_pipeline': ['PassageRankingPipeline'], + 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], diff --git a/modelscope/pipelines/nlp/passage_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py similarity index 88% rename from modelscope/pipelines/nlp/passage_ranking_pipeline.py rename to modelscope/pipelines/nlp/text_ranking_pipeline.py index 1d818ac0..4aa57238 100644 --- a/modelscope/pipelines/nlp/passage_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -9,15 +9,15 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import PassageRankingPreprocessor, Preprocessor +from modelscope.preprocessors import Preprocessor, TextRankingPreprocessor from modelscope.utils.constant import Tasks -__all__ = ['PassageRankingPipeline'] +__all__ = ['TextRankingPipeline'] @PIPELINES.register_module( - Tasks.passage_ranking, module_name=Pipelines.passage_ranking) -class PassageRankingPipeline(Pipeline): + Tasks.text_ranking, module_name=Pipelines.text_ranking) +class TextRankingPipeline(Pipeline): def __init__(self, model: Union[Model, str], @@ -36,7 +36,7 @@ class PassageRankingPipeline(Pipeline): Model) else Model.from_pretrained(model) if preprocessor is None: - preprocessor = PassageRankingPreprocessor( + preprocessor = TextRankingPreprocessor( model.model_dir if isinstance(model, Model) else model, sequence_length=kwargs.pop('sequence_length', 128)) model.eval() diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index f7defd92..63302aa7 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -21,7 +21,7 @@ if TYPE_CHECKING: FillMaskPoNetPreprocessor, NLPPreprocessor, NLPTokenizerPreprocessorBase, - PassageRankingPreprocessor, + TextRankingPreprocessor, RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, @@ -62,7 +62,7 @@ else: 'FillMaskPoNetPreprocessor', 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'PassageRankingPreprocessor', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index f7478329..b95048ba 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: FillMaskPoNetPreprocessor, NLPPreprocessor, NLPTokenizerPreprocessorBase, - PassageRankingPreprocessor, + TextRankingPreprocessor, RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, @@ -33,7 +33,7 @@ else: 'FillMaskPoNetPreprocessor', 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'PassageRankingPreprocessor', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 9049ec99..6075a4b3 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ 
b/modelscope/preprocessors/nlp/nlp_base.py @@ -29,7 +29,7 @@ __all__ = [ 'NLPPreprocessor', 'FillMaskPoNetPreprocessor', 'NLPTokenizerPreprocessorBase', - 'PassageRankingPreprocessor', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', @@ -245,9 +245,9 @@ class NLPPreprocessor(NLPTokenizerPreprocessorBase): @PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.passage_ranking) -class PassageRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in passage ranking model. + Fields.nlp, module_name=Preprocessors.text_ranking) +class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text-ranking model. """ def __init__(self, diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index 86917261..dbfe5ba7 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: ImagePortraitEnhancementTrainer, MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer - from .nlp import SequenceClassificationTrainer, PassageRankingTrainer + from .nlp import SequenceClassificationTrainer, TextRankingTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer from .trainer import EpochBasedTrainer @@ -26,7 +26,7 @@ else: 'ImageInpaintingTrainer' ], 'multi_modal': ['CLIPTrainer'], - 'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'], + 'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'], 'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'], 'trainer': ['EpochBasedTrainer'] } diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py index 001cfefc..7f1bcd63 100644 --- a/modelscope/trainers/nlp/__init__.py +++ b/modelscope/trainers/nlp/__init__.py @@ -6,12 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .sequence_classification_trainer import SequenceClassificationTrainer from .csanmt_translation_trainer import CsanmtTranslationTrainer - from .passage_ranking_trainer import PassageRankingTranier + from .text_ranking_trainer import TextRankingTranier else: _import_structure = { 'sequence_classification_trainer': ['SequenceClassificationTrainer'], 'csanmt_translation_trainer': ['CsanmtTranslationTrainer'], - 'passage_ranking_trainer': ['PassageRankingTrainer'] + 'text_ranking_trainer': ['TextRankingTrainer'] } import sys diff --git a/modelscope/trainers/nlp/passage_ranking_trainer.py b/modelscope/trainers/nlp/text_ranking_trainer.py similarity index 95% rename from modelscope/trainers/nlp/passage_ranking_trainer.py rename to modelscope/trainers/nlp/text_ranking_trainer.py index 711fd0c4..5da9c76a 100644 --- a/modelscope/trainers/nlp/passage_ranking_trainer.py +++ b/modelscope/trainers/nlp/text_ranking_trainer.py @@ -8,6 +8,7 @@ import numpy as np import torch from torch import nn from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm from modelscope.metainfo import Trainers from modelscope.models.base import Model, TorchModel @@ -42,8 +43,8 @@ class GroupCollator(): return batch -@TRAINERS.register_module(module_name=Trainers.nlp_passage_ranking_trainer) -class PassageRankingTrainer(NlpEpochBasedTrainer): +@TRAINERS.register_module(module_name=Trainers.nlp_text_ranking_trainer) +class TextRankingTrainer(NlpEpochBasedTrainer): def __init__( self, @@ -117,7 +118,7 @@ class 
PassageRankingTrainer(NlpEpochBasedTrainer): Example: {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} """ - from modelscope.models.nlp import PassageRanking + from modelscope.models.nlp import TextRanking # get the raw online dataset self.eval_dataloader = self._build_dataloader_with_dataset( self.eval_dataset, @@ -126,7 +127,7 @@ class PassageRankingTrainer(NlpEpochBasedTrainer): # generate a standard dataloader # generate a model if checkpoint_path is not None: - model = PassageRanking.from_pretrained(checkpoint_path) + model = TextRanking.from_pretrained(checkpoint_path) else: model = self.model @@ -141,7 +142,7 @@ class PassageRankingTrainer(NlpEpochBasedTrainer): total_spent_time = 0.0 device = 'cuda:0' if torch.cuda.is_available() else 'cpu' model.to(device) - for _step, batch in enumerate(self.eval_dataloader): + for _step, batch in enumerate(tqdm(self.eval_dataloader)): try: batch = { key: diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 8e986b61..87a0a417 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -103,7 +103,7 @@ class NLPTasks(object): sentence_similarity = 'sentence-similarity' text_classification = 'text-classification' sentence_embedding = 'sentence-embedding' - passage_ranking = 'passage-ranking' + text_ranking = 'text-ranking' relation_extraction = 'relation-extraction' zero_shot = 'zero-shot' translation = 'translation' diff --git a/tests/pipelines/test_passage_ranking.py b/tests/pipelines/test_text_ranking.py similarity index 70% rename from tests/pipelines/test_passage_ranking.py rename to tests/pipelines/test_text_ranking.py index 5faa365e..ece3c617 100644 --- a/tests/pipelines/test_passage_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -4,15 +4,15 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import PassageRanking +from modelscope.models.nlp import TextRanking from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import PassageRankingPipeline -from modelscope.preprocessors import PassageRankingPreprocessor +from modelscope.pipelines.nlp import TextRankingPipeline +from modelscope.preprocessors import TextRankingPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level -class PassageRankingTest(unittest.TestCase): +class TextRankingTest(unittest.TestCase): model_id = 'damo/nlp_corom_passage-ranking_english-base' inputs = { 'source_sentence': ["how long it take to get a master's degree"], @@ -27,11 +27,11 @@ class PassageRankingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) - tokenizer = PassageRankingPreprocessor(cache_path) - model = PassageRanking.from_pretrained(cache_path) - pipeline1 = PassageRankingPipeline(model, preprocessor=tokenizer) + tokenizer = TextRankingPreprocessor(cache_path) + model = TextRanking.from_pretrained(cache_path) + pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.passage_ranking, model=model, preprocessor=tokenizer) + Tasks.text_ranking, model=model, preprocessor=tokenizer) print(f'sentence: {self.inputs}\n' f'pipeline1:{pipeline1(input=self.inputs)}') print() @@ -40,20 +40,19 @@ class PassageRankingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def 
test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - tokenizer = PassageRankingPreprocessor(model.model_dir) + tokenizer = TextRankingPreprocessor(model.model_dir) pipeline_ins = pipeline( - task=Tasks.passage_ranking, model=model, preprocessor=tokenizer) + task=Tasks.text_ranking, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): - pipeline_ins = pipeline( - task=Tasks.passage_ranking, model=self.model_id) + pipeline_ins = pipeline(task=Tasks.text_ranking, model=self.model_id) print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.passage_ranking) + pipeline_ins = pipeline(task=Tasks.text_ranking) print(pipeline_ins(input=self.inputs)) diff --git a/tests/trainers/test_finetune_passage_ranking.py b/tests/trainers/test_finetune_text_ranking.py similarity index 90% rename from tests/trainers/test_finetune_passage_ranking.py rename to tests/trainers/test_finetune_text_ranking.py index f833f981..e603bff2 100644 --- a/tests/trainers/test_finetune_passage_ranking.py +++ b/tests/trainers/test_finetune_text_ranking.py @@ -41,7 +41,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): model_id, train_dataset, eval_dataset, - name=Trainers.nlp_passage_ranking_trainer, + name=Trainers.nlp_text_ranking_trainer, cfg_modify_fn=None, **kwargs): kwargs = dict( @@ -61,8 +61,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase): def test_finetune_msmarco(self): def cfg_modify_fn(cfg): - cfg.task = 'passage-ranking' - cfg['preprocessor'] = {'type': 'passage-ranking'} + cfg.task = 'text-ranking' + cfg['preprocessor'] = {'type': 'text-ranking'} cfg.train.optimizer.lr = 2e-5 cfg['dataset'] = { 'train': { @@ -105,7 +105,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): }, { 'type': 'EvaluationHook', 'by_epoch': False, - 'interval': 3000 + 'interval': 15 }] return cfg @@ -114,18 +114,19 @@ class TestFinetuneSequenceClassification(unittest.TestCase): train_ds = ds['train'].to_hf_dataset() dev_ds = ds['train'].to_hf_dataset() + model_id = 'damo/nlp_corom_passage-ranking_english-base' self.finetune( - model_id='damo/nlp_corom_passage-ranking_english-base', + model_id=model_id, train_dataset=train_ds, eval_dataset=dev_ds, cfg_modify_fn=cfg_modify_fn) output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) - self.pipeline_passage_ranking(output_dir) + self.pipeline_text_ranking(output_dir) - def pipeline_passage_ranking(self, model_dir): + def pipeline_text_ranking(self, model_dir): model = Model.from_pretrained(model_dir) - pipeline_ins = pipeline(task=Tasks.passage_ranking, model=model) + pipeline_ins = pipeline(task=Tasks.text_ranking, model=model) print(pipeline_ins(input=self.inputs)) From e09d277fd3f53eaa6b3f2288e787ffc8b1f922b3 Mon Sep 17 00:00:00 2001 From: "tingwei.gtw" Date: Sat, 22 Oct 2022 19:19:23 +0800 Subject: [PATCH 093/129] [to #42322933] fix cpu inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复cpu推理 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10468823 --- .../models/cv/face_human_hand_detection/one_stage_detector.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py 
b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py index c1d0a52f..0d1cd15d 100644 --- a/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py +++ b/modelscope/models/cv/face_human_hand_detection/one_stage_detector.py @@ -56,9 +56,6 @@ class OneStageDetector(nn.Module): def inference(self, meta): with torch.no_grad(): - torch.cuda.synchronize() preds = self(meta['img']) - torch.cuda.synchronize() results = self.head.post_process(preds, meta) - torch.cuda.synchronize() return results From 1854ceeb74466c0a69766447d2dd1da89005e0ed Mon Sep 17 00:00:00 2001 From: "shichen.fsc" Date: Sat, 22 Oct 2022 20:30:45 +0800 Subject: [PATCH 094/129] [to #42322933] Fix all asr models in UT with mistake model_id Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10491024 --- .../test_automatic_speech_recognition.py | 87 +++++++------------ 1 file changed, 32 insertions(+), 55 deletions(-) diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index c37a6a3f..b6532868 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ b/tests/pipelines/test_automatic_speech_recognition.py @@ -80,164 +80,141 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, all_models_info = [ { - 'model_group': 'damo', 'model_id': - 'speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', + 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', - 'model_id': 'speech_paraformer_asr_nat-aishell1-pytorch', + 'model_id': 'damo/speech_paraformer_asr_nat-aishell1-pytorch', + 'wav_path': 'data/test/audios/asr_example.wav' + }, + { + 'model_id': 'damo/speech_paraformer_asr_nat-aishell2-pytorch', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', + 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1', + 'damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1', 'wav_path': 'data/test/audios/asr_example_8K.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_8K.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_8K.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', + 'damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline', 'wav_path': 
'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_cn_en.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_cn_en.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_cn_dialect.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_cn_dialect.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online', + 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online', + 'damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_8K.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_en.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_en.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_ru.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_ru.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_es.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_es.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_ko.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_ko.wav' }, { - 'model_group': 'damo', 'model_id': - 
'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_ja.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_ja.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online', + 'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_id.wav' }, { - 'model_group': 'damo', 'model_id': - 'speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline', + 'damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline', 'wav_path': 'data/test/audios/asr_example_id.wav' }, ] @@ -404,7 +381,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, logger.info('Run ASR test with all models') for item in self.all_models_info: - model_id = item['model_group'] + '/' + item['model_id'] + model_id = item['model_id'] wav_path = item['wav_path'] rec_result = self.run_pipeline( model_id=model_id, audio_in=wav_path) From 46107e3ecf129b155dac7de57edddbb1b1686113 Mon Sep 17 00:00:00 2001 From: "baiguan.yt" Date: Sat, 22 Oct 2022 20:31:59 +0800 Subject: [PATCH 095/129] [to #42322933]converting string to int to meet the input of face-image-generation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10489981 --- modelscope/pipelines/cv/face_image_generation_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modelscope/pipelines/cv/face_image_generation_pipeline.py b/modelscope/pipelines/cv/face_image_generation_pipeline.py index f00d639e..1b4e2e8a 100644 --- a/modelscope/pipelines/cv/face_image_generation_pipeline.py +++ b/modelscope/pipelines/cv/face_image_generation_pipeline.py @@ -61,6 +61,8 @@ class FaceImageGenerationPipeline(Pipeline): return input def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + if isinstance(input, str): + input = int(input) assert isinstance(input, int) torch.manual_seed(input) torch.cuda.manual_seed(input) From 9edfd7e50c86c1a333f8e2dd9724e1060a1f0a66 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Sat, 22 Oct 2022 20:33:49 +0800 Subject: [PATCH 096/129] [to #42322933] update tableqa params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 增加传入table_id 2. 将result和table的结构统一 3. 
默认开启is_use_sqlite Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10492027 --- .../nlp/table_question_answering_pipeline.py | 51 +++++++++-------- .../preprocessors/star3/fields/database.py | 2 +- .../preprocessors/star3/fields/schema_link.py | 31 ++++++----- .../table_question_answering_preprocessor.py | 2 + .../test_table_question_answering.py | 55 +++++++++++++++++-- 5 files changed, 96 insertions(+), 45 deletions(-) diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index ca17c9b1..08501953 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -72,6 +72,7 @@ class TableQuestionAnsweringPipeline(Pipeline): action = self.action_ops[result['action']] headers = table['header_name'] current_sql = result['sql'] + current_sql['from'] = [table['table_id']] if history_sql is None: return current_sql @@ -216,10 +217,11 @@ class TableQuestionAnsweringPipeline(Pipeline): else: return current_sql - def sql_dict_to_str(self, result, table): + def sql_dict_to_str(self, result, tables): """ convert sql struct to string """ + table = tables[result['sql']['from'][0]] header_names = table['header_name'] + ['空列'] header_ids = table['header_id'] + ['null'] sql = result['sql'] @@ -279,42 +281,43 @@ class TableQuestionAnsweringPipeline(Pipeline): """ result = inputs['result'] history_sql = inputs['history_sql'] - result['sql'] = self.post_process_multi_turn( - history_sql=history_sql, - result=result, - table=self.db.tables[result['table_id']]) - result['sql']['from'] = [result['table_id']] - sql = self.sql_dict_to_str( - result=result, table=self.db.tables[result['table_id']]) + try: + result['sql'] = self.post_process_multi_turn( + history_sql=history_sql, + result=result, + table=self.db.tables[result['table_id']]) + except Exception: + result['sql'] = history_sql + sql = self.sql_dict_to_str(result=result, tables=self.db.tables) # add sqlite if self.db.is_use_sqlite: try: cursor = self.db.connection_obj.cursor().execute(sql.query) - names = [{ - 'name': - description[0], - 'label': - self.db.tables[result['table_id']]['headerid2name'].get( - description[0], description[0]) - } for description in cursor.description] - cells = [] + header_ids, header_names = [], [] + for description in cursor.description: + header_ids.append(self.db.tables[result['table_id']] + ['headerid2name'].get( + description[0], description[0])) + header_names.append(description[0]) + rows = [] for res in cursor.fetchall(): - row = {} - for name, cell in zip(names, res): - row[name['name']] = cell - cells.append(row) - tabledata = {'headers': names, 'cells': cells} + rows.append(list(res)) + tabledata = { + 'header_id': header_ids, + 'header_name': header_names, + 'rows': rows + } except Exception: - tabledata = {'headers': [], 'cells': []} + tabledata = {'header_id': [], 'header_name': [], 'rows': []} else: - tabledata = {'headers': [], 'cells': []} + tabledata = {'header_id': [], 'header_name': [], 'rows': []} output = { OutputKeys.SQL_STRING: sql.string, OutputKeys.SQL_QUERY: sql.query, OutputKeys.HISTORY: result['sql'], - OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), + OutputKeys.QUERT_RESULT: tabledata, } return output diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py index 3d3a1f8d..5debfe2c 100644 --- 
a/modelscope/preprocessors/star3/fields/database.py +++ b/modelscope/preprocessors/star3/fields/database.py @@ -13,7 +13,7 @@ class Database: tokenizer, table_file_path, syn_dict_file_path, - is_use_sqlite=False): + is_use_sqlite=True): self.tokenizer = tokenizer self.is_use_sqlite = is_use_sqlite if self.is_use_sqlite: diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py index 7f483a1f..220a71d8 100644 --- a/modelscope/preprocessors/star3/fields/schema_link.py +++ b/modelscope/preprocessors/star3/fields/schema_link.py @@ -293,6 +293,7 @@ class SchemaLinker: nlu_t, tables, col_syn_dict, + table_id=None, history_sql=None): """ get linking between question and schema column @@ -300,6 +301,9 @@ class SchemaLinker: typeinfos = [] numbers = re.findall(r'[-]?\d*\.\d+|[-]?\d+|\d+', nlu) + if table_id is not None and table_id in tables: + tables = {table_id: tables[table_id]} + # search schema link in every table search_result_list = [] for tablename in tables: @@ -411,26 +415,25 @@ class SchemaLinker: # get the match score of each table match_score = self.get_table_match_score(nlu_t, schema_link) + # cal table_score + if history_sql is not None and 'from' in history_sql: + table_score = int(table['table_id'] == history_sql['from'][0]) + else: + table_score = 0 + search_result = { - 'table_id': - table['table_id'], - 'question_knowledge': - final_question, - 'header_knowledge': - final_header, - 'schema_link': - schema_link, - 'match_score': - match_score, - 'table_score': - int(table['table_id'] == history_sql['from'][0]) - if history_sql is not None else 0 + 'table_id': table['table_id'], + 'question_knowledge': final_question, + 'header_knowledge': final_header, + 'schema_link': schema_link, + 'match_score': match_score, + 'table_score': table_score } search_result_list.append(search_result) search_result_list = sorted( search_result_list, key=lambda x: (x['match_score'], x['table_score']), - reverse=True)[0:4] + reverse=True)[0:1] return search_result_list diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py index f98aa6d0..ed2911f6 100644 --- a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py @@ -95,6 +95,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): # tokenize question question = data['question'] + table_id = data.get('table_id', None) history_sql = data.get('history_sql', None) nlu = question.lower() nlu_t = self.tokenizer.tokenize(nlu) @@ -106,6 +107,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): nlu_t=nlu_t, tables=self.db.tables, col_syn_dict=self.db.syn_dict, + table_id=table_id, history_sql=history_sql) # collect data diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 3d943e51..571ca795 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -43,7 +43,7 @@ def tableqa_tracking_and_print_results_with_history( print('sql text:', output_dict[OutputKeys.SQL_STRING]) print('sql query:', output_dict[OutputKeys.SQL_QUERY]) print('query result:', output_dict[OutputKeys.QUERT_RESULT]) - print('json dumps', json.dumps(output_dict)) + print('json dumps', json.dumps(output_dict, ensure_ascii=False)) print() historical_queries = output_dict[OutputKeys.HISTORY] @@ -66,10 
+66,42 @@ def tableqa_tracking_and_print_results_without_history( print('sql text:', output_dict[OutputKeys.SQL_STRING]) print('sql query:', output_dict[OutputKeys.SQL_QUERY]) print('query result:', output_dict[OutputKeys.QUERT_RESULT]) - print('json dumps', json.dumps(output_dict)) + print('json dumps', json.dumps(output_dict, ensure_ascii=False)) print() +def tableqa_tracking_and_print_results_with_tableid( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + ['有哪些风险类型?', 'fund'], + ['风险类型有多少种?', 'reservoir'], + ['珠江流域的小(2)型水库的库容总量是多少?', 'reservoir'], + ['那平均值是多少?', 'reservoir'], + ['那水库的名称呢?', 'reservoir'], + ['换成中型的呢?', 'reservoir'], + ['枣庄营业厅的电话', 'business'], + ['那地址呢?', 'business'], + ['枣庄营业厅的电话和地址', 'business'], + ], + } + for p in pipelines: + historical_queries = None + for question, table_id in test_case['utterance']: + output_dict = p({ + 'question': question, + 'table_id': table_id, + 'history_sql': historical_queries + }) + print('question', question) + print('sql text:', output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict, ensure_ascii=False)) + print() + historical_queries = output_dict[OutputKeys.HISTORY] + + class TableQuestionAnswering(unittest.TestCase): def setUp(self) -> None: @@ -93,15 +125,27 @@ class TableQuestionAnswering(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) + self.tokenizer = BertTokenizer( + os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) + db = Database( + tokenizer=self.tokenizer, + table_file_path=[ + os.path.join(model.model_dir, 'databases', fname) + for fname in os.listdir( + os.path.join(model.model_dir, 'databases')) + ], + syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'), + is_use_sqlite=False) preprocessor = TableQuestionAnsweringPreprocessor( - model_dir=model.model_dir) + model_dir=model.model_dir, db=db) pipelines = [ pipeline( Tasks.table_question_answering, model=model, - preprocessor=preprocessor) + preprocessor=preprocessor, + db=db) ] - tableqa_tracking_and_print_results_with_history(pipelines) + tableqa_tracking_and_print_results_with_tableid(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_task(self): @@ -132,7 +176,6 @@ class TableQuestionAnswering(unittest.TestCase): db=db) ] tableqa_tracking_and_print_results_without_history(pipelines) - tableqa_tracking_and_print_results_with_history(pipelines) if __name__ == '__main__': From 2a87dee561a04d15e00e5c3f7be5af1be0362098 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Sat, 22 Oct 2022 21:09:15 +0800 Subject: [PATCH 097/129] [to #42322933]support multi tasks-- will be failed, since configuration has not changed yet Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10492024 --- .../models/nlp/heads/infromation_extraction_head.py | 2 ++ .../models/nlp/task_models/information_extraction.py | 2 ++ modelscope/pipelines/builder.py | 3 +++ .../pipelines/nlp/information_extraction_pipeline.py | 2 ++ tests/pipelines/test_relation_extraction.py | 10 +++++----- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/modelscope/models/nlp/heads/infromation_extraction_head.py b/modelscope/models/nlp/heads/infromation_extraction_head.py index 6c3388f0..626f1b59 100644 
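
A condensed sketch of the multi-turn flow exercised by the new tableqa_tracking_and_print_results_with_tableid test above, showing how a question, an explicit table_id and the previous SQL travel together through the pipeline. Building the pipeline from a model id alone (instead of wiring up a Database and preprocessor as the test does) is an assumption here, and the model id is only the default registered in modelscope/pipelines/builder.py, not something this patch changes.

```python
# Hypothetical usage sketch; questions and table ids are copied from the test above.
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Assumption: the pipeline can build its own Database/preprocessor from the model id.
p = pipeline(Tasks.table_question_answering,
             model='damo/nlp-convai-text2sql-pretrain-cn')

history_sql = None
for question, table_id in [['有哪些风险类型?', 'fund'],
                           ['风险类型有多少种?', 'reservoir'],
                           ['珠江流域的小(2)型水库的库容总量是多少?', 'reservoir']]:
    output = p({
        'question': question,
        'table_id': table_id,       # new in this patch: pins schema linking to one table
        'history_sql': history_sql  # previous turn's SQL, used for follow-up questions
    })
    print(output[OutputKeys.SQL_QUERY], output[OutputKeys.QUERT_RESULT])
    history_sql = output[OutputKeys.HISTORY]
```

A later patch in this series wraps these keys under OutputKeys.OUTPUT, so the direct key access shown here matches the state of the code as of this commit only.
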
--- a/modelscope/models/nlp/heads/infromation_extraction_head.py +++ b/modelscope/models/nlp/heads/infromation_extraction_head.py @@ -10,6 +10,8 @@ from modelscope.utils.constant import Tasks @HEADS.register_module( Tasks.information_extraction, module_name=Heads.information_extraction) +@HEADS.register_module( + Tasks.relation_extraction, module_name=Heads.information_extraction) class InformationExtractionHead(TorchHead): def __init__(self, **kwargs): diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index 0a7d5a47..a206c2fc 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -16,6 +16,8 @@ __all__ = ['InformationExtractionModel'] @MODELS.register_module( Tasks.information_extraction, module_name=TaskModels.information_extraction) +@MODELS.register_module( + Tasks.relation_extraction, module_name=TaskModels.information_extraction) class InformationExtractionModel(SingleBackboneTaskModelBase): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f183afc1..aaea0bb6 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -31,6 +31,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.named_entity_recognition: (Pipelines.named_entity_recognition, 'damo/nlp_raner_named-entity-recognition_chinese-base-news'), + Tasks.relation_extraction: + (Pipelines.relation_extraction, + 'damo/nlp_bert_relation-extraction_chinese-base'), Tasks.information_extraction: (Pipelines.relation_extraction, 'damo/nlp_bert_relation-extraction_chinese-base'), diff --git a/modelscope/pipelines/nlp/information_extraction_pipeline.py b/modelscope/pipelines/nlp/information_extraction_pipeline.py index 763e941c..8ac85f43 100644 --- a/modelscope/pipelines/nlp/information_extraction_pipeline.py +++ b/modelscope/pipelines/nlp/information_extraction_pipeline.py @@ -17,6 +17,8 @@ __all__ = ['InformationExtractionPipeline'] @PIPELINES.register_module( Tasks.information_extraction, module_name=Pipelines.relation_extraction) +@PIPELINES.register_module( + Tasks.relation_extraction, module_name=Pipelines.relation_extraction) class InformationExtractionPipeline(Pipeline): def __init__(self, diff --git a/tests/pipelines/test_relation_extraction.py b/tests/pipelines/test_relation_extraction.py index 57d98f66..561eaf21 100644 --- a/tests/pipelines/test_relation_extraction.py +++ b/tests/pipelines/test_relation_extraction.py @@ -15,7 +15,7 @@ from modelscope.utils.test_utils import test_level class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.information_extraction + self.task = Tasks.relation_extraction self.model_id = 'damo/nlp_bert_relation-extraction_chinese-base' sentence = '高捷,祖籍江苏,本科毕业于东南大学' @@ -28,7 +28,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): pipeline1 = InformationExtractionPipeline( model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.information_extraction, model=model, preprocessor=tokenizer) + Tasks.relation_extraction, model=model, preprocessor=tokenizer) print(f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence)}') print() @@ -39,7 +39,7 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) tokenizer = RelationExtractionPreprocessor(model.model_dir) 
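
As context for the test changes around this hunk: with the extra register_module decorators added in this patch, the same head, task model and pipeline are reachable under both Tasks.information_extraction and Tasks.relation_extraction. A minimal sketch, assuming the default model id added to builder.py and the sentence used in the test:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

sentence = '高捷,祖籍江苏,本科毕业于东南大学'
# Both task names should now resolve to the same relation-extraction pipeline.
p_re = pipeline(task=Tasks.relation_extraction,
                model='damo/nlp_bert_relation-extraction_chinese-base')
p_ie = pipeline(task=Tasks.information_extraction,
                model='damo/nlp_bert_relation-extraction_chinese-base')
print(p_re(input=sentence))
print(p_ie(input=sentence))
```
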
pipeline_ins = pipeline( - task=Tasks.information_extraction, + task=Tasks.relation_extraction, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence)) @@ -47,12 +47,12 @@ class RelationExtractionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.information_extraction, model=self.model_id) + task=Tasks.relation_extraction, model=self.model_id) print(pipeline_ins(input=self.sentence)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.information_extraction) + pipeline_ins = pipeline(task=Tasks.relation_extraction) print(pipeline_ins(input=self.sentence)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') From 707cbef013f903d6854548603209e41777ab05a3 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 22 Oct 2022 23:25:18 +0800 Subject: [PATCH 098/129] [to #42322933]Fix bug in daily UT Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10491891 --- ...st_export_sbert_sequence_classification.py | 2 +- tests/msdatasets/test_ms_dataset.py | 4 +- tests/pipelines/test_gpt3_text_generation.py | 4 +- tests/pipelines/test_text_classification.py | 100 ------------------ .../test_finetune_sequence_classification.py | 3 +- tests/trainers/test_trainer_with_nlp.py | 21 +++- 6 files changed, 24 insertions(+), 110 deletions(-) delete mode 100644 tests/pipelines/test_text_classification.py diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 535b3f5d..97926539 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -22,7 +22,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skip def test_export_sbert_sequence_classification(self): model = Model.from_pretrained(self.model_id) print( diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 1e537e93..dff411f6 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -71,7 +71,7 @@ class MsDatasetTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @require_torch def test_to_torch_dataset_text(self): - model_id = 'damo/bert-base-sst2' + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) preprocessor = SequenceClassificationPreprocessor( nlp_model.model_dir, @@ -93,7 +93,7 @@ class MsDatasetTest(unittest.TestCase): def test_to_tf_dataset_text(self): import tensorflow as tf tf.compat.v1.enable_eager_execution() - model_id = 'damo/bert-base-sst2' + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' nlp_model = Model.from_pretrained(model_id) preprocessor = SequenceClassificationPreprocessor( nlp_model.model_dir, diff --git a/tests/pipelines/test_gpt3_text_generation.py b/tests/pipelines/test_gpt3_text_generation.py index 413b5874..674e95bb 100644 --- a/tests/pipelines/test_gpt3_text_generation.py +++ b/tests/pipelines/test_gpt3_text_generation.py @@ -17,12 +17,12 @@ class TextGPT3GenerationTest(unittest.TestCase): self.model_dir_13B = 
snapshot_download(self.model_id_13B) self.input = '好的' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip('distributed gpt3 1.3B, skipped') def test_gpt3_1_3B(self): pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B) print(pipe(self.input)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip('distributed gpt3 2.7B, skipped') def test_gpt3_2_7B(self): pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B) print(pipe(self.input)) diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py deleted file mode 100644 index 39dbac99..00000000 --- a/tests/pipelines/test_text_classification.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import unittest - -from modelscope.models import Model -from modelscope.msdatasets import MsDataset -from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline -from modelscope.preprocessors import SequenceClassificationPreprocessor -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): - sentence1 = 'i like this wonderful place' - - def setUp(self) -> None: - self.model_id = 'damo/bert-base-sst2' - self.task = Tasks.text_classification - - def predict(self, pipeline_ins: SequenceClassificationPipeline): - from easynlp.appzoo import load_dataset - - set = load_dataset('glue', 'sst2') - data = set['test']['sentence'][:3] - - results = pipeline_ins(data[0]) - print(results) - results = pipeline_ins(data[1]) - print(results) - - print(data) - - def printDataset(self, dataset: MsDataset): - for i, r in enumerate(dataset): - if i > 10: - break - print(r) - - # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - @unittest.skip('nlp model does not support tensor input, skipped') - def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - preprocessor = SequenceClassificationPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - task=Tasks.text_classification, - model=model, - preprocessor=preprocessor) - print(f'sentence1: {self.sentence1}\n' - f'pipeline1:{pipeline_ins(input=self.sentence1)}') - - # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - @unittest.skip('nlp model does not support tensor input, skipped') - def test_run_with_model_name(self): - text_classification = pipeline( - task=Tasks.text_classification, model=self.model_id) - result = text_classification( - MsDataset.load( - 'xcopa', - subset_name='translation-et', - namespace='damotest', - split='test', - target='premise')) - self.printDataset(result) - - # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - @unittest.skip('nlp model does not support tensor input, skipped') - def test_run_with_default_model(self): - text_classification = pipeline(task=Tasks.text_classification) - result = text_classification( - MsDataset.load( - 'xcopa', - subset_name='translation-et', - namespace='damotest', - split='test', - target='premise')) - self.printDataset(result) - - # @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - @unittest.skip('nlp model does not support tensor input, skipped') - def 
test_run_with_modelscope_dataset(self): - text_classification = pipeline(task=Tasks.text_classification) - # loaded from modelscope dataset - dataset = MsDataset.load( - 'xcopa', - subset_name='translation-et', - namespace='damotest', - split='test', - target='premise') - result = text_classification(dataset) - self.printDataset(result) - - @unittest.skip('demo compatibility test is only enabled on a needed-basis') - def test_demo_compatibility(self): - self.compatibility_check() - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index 27db1f18..aa8aba5c 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -38,7 +38,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skip( + 'Skip testing trainer repeatable, because it\'s unstable in daily UT') def test_trainer_repeatable(self): import torch # noqa diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 8357e778..5b0c9982 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -169,11 +169,25 @@ class TestTrainerWithNlp(unittest.TestCase): cfg.preprocessor.label = 'label' cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1} cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1} + cfg.train.dataloader.batch_size_per_gpu = 2 + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 3, + 'by_epoch': False, + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'interval': 1 + }] cfg.train.work_dir = self.tmp_dir cfg_file = os.path.join(self.tmp_dir, 'config.json') cfg.dump(cfg_file) dataset = MsDataset.load('clue', subset_name='afqmc', split='train') - dataset = dataset.to_hf_dataset().select(range(128)) + dataset = dataset.to_hf_dataset().select(range(4)) kwargs = dict( model=model_id, train_dataset=dataset, @@ -190,7 +204,7 @@ class TestTrainerWithNlp(unittest.TestCase): PRIORITY = Priority.VERY_LOW def after_iter(self, trainer): - if trainer.iter == 12: + if trainer.iter == 3: raise MsRegressTool.EarlyStopError('Test finished.') if 'EarlyStopHook' not in [ @@ -207,12 +221,11 @@ class TestTrainerWithNlp(unittest.TestCase): results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) - trainer = build_trainer(default_args=kwargs) regress_tool = MsRegressTool(baseline=False) with regress_tool.monitor_ms_train( trainer, 'trainer_continue_train', level='strict'): - trainer.train(os.path.join(self.tmp_dir, 'iter_12.pth')) + trainer.train(os.path.join(self.tmp_dir, 'iter_3.pth')) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_trainer_with_model_and_args(self): From 182ba1768fefeac39907d5a4f6d7cdbbf715b6fe Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Sun, 23 Oct 2022 10:56:52 +0800 Subject: [PATCH 099/129] [to #42322933]support multi tasks for part of speech Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10491994 --- .../models/nlp/heads/token_classification_head.py | 2 ++ .../models/nlp/task_models/token_classification.py | 2 ++ modelscope/pipelines/builder.py | 2 ++ .../pipelines/nlp/token_classification_pipeline.py | 2 ++ 
tests/pipelines/test_part_of_speech.py | 11 ++++------- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index ace3deac..3f19ca67 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -14,6 +14,8 @@ from modelscope.utils.constant import Tasks @HEADS.register_module( Tasks.token_classification, module_name=Heads.token_classification) +@HEADS.register_module( + Tasks.part_of_speech, module_name=Heads.token_classification) class TokenClassificationHead(TorchHead): def __init__(self, **kwargs): diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index f3930182..a39f58bf 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -19,6 +19,8 @@ __all__ = ['TokenClassificationModel'] @MODELS.register_module( Tasks.token_classification, module_name=TaskModels.token_classification) +@MODELS.register_module( + Tasks.part_of_speech, module_name=TaskModels.token_classification) class TokenClassificationModel(SingleBackboneTaskModelBase): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index aaea0bb6..8c81118c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -25,6 +25,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.word_segmentation: (Pipelines.word_segmentation, 'damo/nlp_structbert_word-segmentation_chinese-base'), + Tasks.part_of_speech: (Pipelines.part_of_speech, + 'damo/nlp_structbert_part-of-speech_chinese-base'), Tasks.token_classification: (Pipelines.part_of_speech, 'damo/nlp_structbert_part-of-speech_chinese-base'), diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index c57dbf20..055a4b8a 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -18,6 +18,8 @@ __all__ = ['TokenClassificationPipeline'] @PIPELINES.register_module( Tasks.token_classification, module_name=Pipelines.part_of_speech) +@PIPELINES.register_module( + Tasks.part_of_speech, module_name=Pipelines.part_of_speech) class TokenClassificationPipeline(Pipeline): def __init__(self, diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py index 25f4491c..61cdfe73 100644 --- a/tests/pipelines/test_part_of_speech.py +++ b/tests/pipelines/test_part_of_speech.py @@ -13,7 +13,7 @@ from modelscope.utils.test_utils import test_level class PartOfSpeechTest(unittest.TestCase): - model_id = 'damo/nlp_structbert_part-of-speech_chinese-base' + model_id = 'damo/nlp_structbert_part-of-speech_chinese-lite' sentence = '今天天气不错,适合出去游玩' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') @@ -34,20 +34,17 @@ class PartOfSpeechTest(unittest.TestCase): model = Model.from_pretrained(self.model_id) tokenizer = TokenClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( - task=Tasks.token_classification, - model=model, - preprocessor=tokenizer) + task=Tasks.part_of_speech, model=model, preprocessor=tokenizer) print(pipeline_ins(input=self.sentence)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def 
test_run_with_model_name(self): - pipeline_ins = pipeline( - task=Tasks.token_classification, model=self.model_id) + pipeline_ins = pipeline(task=Tasks.part_of_speech, model=self.model_id) print(pipeline_ins(input=self.sentence)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline(task=Tasks.token_classification) + pipeline_ins = pipeline(task=Tasks.part_of_speech) print(pipeline_ins(input=self.sentence)) From cf831dbf986f4b926d49f1bbbba444a70f6f48b2 Mon Sep 17 00:00:00 2001 From: "yanheng.wyh" Date: Sun, 23 Oct 2022 12:22:33 +0800 Subject: [PATCH 100/129] [to #42322933] fix cv/animalRecog output format * fix output format Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10491722 --- modelscope/pipelines/cv/animal_recognition_pipeline.py | 5 ++--- modelscope/pipelines/cv/general_recognition_pipeline.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/pipelines/cv/animal_recognition_pipeline.py b/modelscope/pipelines/cv/animal_recognition_pipeline.py index fad14680..671a5b4c 100644 --- a/modelscope/pipelines/cv/animal_recognition_pipeline.py +++ b/modelscope/pipelines/cv/animal_recognition_pipeline.py @@ -113,9 +113,8 @@ class AnimalRecognitionPipeline(Pipeline): label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { - OutputKeys.SCORES: - score.item(), + OutputKeys.SCORES: [score.item()], OutputKeys.LABELS: - label_mapping[inputs['outputs'].argmax()].split('\t')[1] + [label_mapping[inputs['outputs'].argmax()].split('\t')[1]] } return inputs diff --git a/modelscope/pipelines/cv/general_recognition_pipeline.py b/modelscope/pipelines/cv/general_recognition_pipeline.py index 07222086..80f6f88a 100644 --- a/modelscope/pipelines/cv/general_recognition_pipeline.py +++ b/modelscope/pipelines/cv/general_recognition_pipeline.py @@ -114,9 +114,8 @@ class GeneralRecognitionPipeline(Pipeline): label_mapping = f.readlines() score = torch.max(inputs['outputs']) inputs = { - OutputKeys.SCORES: - score.item(), + OutputKeys.SCORES: [score.item()], OutputKeys.LABELS: - label_mapping[inputs['outputs'].argmax()].split('\t')[1] + [label_mapping[inputs['outputs'].argmax()].split('\t')[1]] } return inputs From 35644fa0a7b9532f05c0f6d9e54101b32f2276b5 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Sun, 23 Oct 2022 20:25:24 +0800 Subject: [PATCH 101/129] [to #42322933] change star3 to space_T_cn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 合并star和star3框架 2. 
修改star和star3的model type Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10492793 --- modelscope/metainfo.py | 5 +++-- .../nlp/{star3 => space_T_cn}/__init__.py | 0 .../configuration_space_T_cn.py} | 16 +++++++------- .../modeling_space_T_cn.py} | 21 ++++++++++--------- modelscope/models/nlp/star_text_to_sql.py | 2 +- .../models/nlp/table_question_answering.py | 14 ++++++------- modelscope/outputs.py | 11 +--------- modelscope/pipelines/builder.py | 3 --- .../conversational_text_to_sql_pipeline.py | 4 ++-- .../nlp/table_question_answering_pipeline.py | 7 ++++--- modelscope/preprocessors/__init__.py | 4 ++-- .../{star3 => space_T_cn}/__init__.py | 0 .../{star3 => space_T_cn}/fields/__init__.py | 0 .../{star3 => space_T_cn}/fields/database.py | 2 +- .../fields/schema_link.py | 2 +- .../{star3 => space_T_cn}/fields/struct.py | 0 .../table_question_answering_preprocessor.py | 4 ++-- modelscope/utils/constant.py | 1 - modelscope/utils/nlp/nlp_utils.py | 2 +- .../test_conversational_text_to_sql.py | 7 +------ .../test_table_question_answering.py | 13 ++++-------- 21 files changed, 48 insertions(+), 70 deletions(-) rename modelscope/models/nlp/{star3 => space_T_cn}/__init__.py (100%) rename modelscope/models/nlp/{star3/configuration_star3.py => space_T_cn/configuration_space_T_cn.py} (91%) rename modelscope/models/nlp/{star3/modeling_star3.py => space_T_cn/modeling_space_T_cn.py} (98%) rename modelscope/preprocessors/{star3 => space_T_cn}/__init__.py (100%) rename modelscope/preprocessors/{star3 => space_T_cn}/fields/__init__.py (100%) rename modelscope/preprocessors/{star3 => space_T_cn}/fields/database.py (98%) rename modelscope/preprocessors/{star3 => space_T_cn}/fields/schema_link.py (99%) rename modelscope/preprocessors/{star3 => space_T_cn}/fields/struct.py (100%) rename modelscope/preprocessors/{star3 => space_T_cn}/table_question_answering_preprocessor.py (96%) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 1d6fd874..913589d8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -67,8 +67,9 @@ class Models(object): space_dst = 'space-dst' space_intent = 'space-intent' space_modeling = 'space-modeling' - star = 'star' - star3 = 'star3' + space_T_en = 'space-T-en' + space_T_cn = 'space-T-cn' + tcrf = 'transformer-crf' transformer_softmax = 'transformer-softmax' lcrf = 'lstm-crf' diff --git a/modelscope/models/nlp/star3/__init__.py b/modelscope/models/nlp/space_T_cn/__init__.py similarity index 100% rename from modelscope/models/nlp/star3/__init__.py rename to modelscope/models/nlp/space_T_cn/__init__.py diff --git a/modelscope/models/nlp/star3/configuration_star3.py b/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py similarity index 91% rename from modelscope/models/nlp/star3/configuration_star3.py rename to modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py index 4c5ae677..553d8592 100644 --- a/modelscope/models/nlp/star3/configuration_star3.py +++ b/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py @@ -24,8 +24,8 @@ import json logger = logging.getLogger(__name__) -class Star3Config(object): - """Configuration class to store the configuration of a `Star3Model`. +class SpaceTCnConfig(object): + """Configuration class to store the configuration of a `SpaceTCnModel`. """ def __init__(self, @@ -40,10 +40,10 @@ class Star3Config(object): max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02): - """Constructs Star3Config. + """Constructs SpaceTCnConfig. 
Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `Star3Model`. + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `SpaceTCnConfig`. hidden_size: Size of the encoder layers and the pooler layer. num_hidden_layers: Number of hidden layers in the Transformer encoder. num_attention_heads: Number of attention heads for each attention layer in @@ -59,7 +59,7 @@ class Star3Config(object): max_position_embeddings: The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into `Star3Model`. + type_vocab_size: The vocabulary size of the `token_type_ids` passed into `SpaceTCnConfig`. initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ @@ -89,15 +89,15 @@ class Star3Config(object): @classmethod def from_dict(cls, json_object): - """Constructs a `Star3Config` from a Python dictionary of parameters.""" - config = Star3Config(vocab_size_or_config_json_file=-1) + """Constructs a `SpaceTCnConfig` from a Python dictionary of parameters.""" + config = SpaceTCnConfig(vocab_size_or_config_json_file=-1) for key, value in json_object.items(): config.__dict__[key] = value return config @classmethod def from_json_file(cls, json_file): - """Constructs a `Star3Config` from a json file of parameters.""" + """Constructs a `SpaceTCnConfig` from a json file of parameters.""" with open(json_file, 'r', encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) diff --git a/modelscope/models/nlp/star3/modeling_star3.py b/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py similarity index 98% rename from modelscope/models/nlp/star3/modeling_star3.py rename to modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py index 13f7136a..72c94724 100644 --- a/modelscope/models/nlp/star3/modeling_star3.py +++ b/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py @@ -27,7 +27,8 @@ import numpy as np import torch from torch import nn -from modelscope.models.nlp.star3.configuration_star3 import Star3Config +from modelscope.models.nlp.space_T_cn.configuration_space_T_cn import \ + SpaceTCnConfig from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger @@ -609,9 +610,9 @@ class PreTrainedBertModel(nn.Module): def __init__(self, config, *inputs, **kwargs): super(PreTrainedBertModel, self).__init__() - if not isinstance(config, Star3Config): + if not isinstance(config, SpaceTCnConfig): raise ValueError( - 'Parameter config in `{}(config)` should be an instance of class `Star3Config`. ' + 'Parameter config in `{}(config)` should be an instance of class `SpaceTCnConfig`. ' 'To create a model from a Google pretrained model use ' '`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`'.format( self.__class__.__name__, self.__class__.__name__)) @@ -676,7 +677,7 @@ class PreTrainedBertModel(nn.Module): serialization_dir = tempdir # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) - config = Star3Config.from_json_file(config_file) + config = SpaceTCnConfig.from_json_file(config_file) logger.info('Model config {}'.format(config)) # Instantiate model. 
model = cls(config, *inputs, **kwargs) @@ -742,11 +743,11 @@ class PreTrainedBertModel(nn.Module): return model -class Star3Model(PreTrainedBertModel): - """Star3Model model ("Bidirectional Embedding Representations from a Transformer pretrained on STAR3.0"). +class SpaceTCnModel(PreTrainedBertModel): + """SpaceTCnModel model ("Bidirectional Embedding Representations from a Transformer pretrained on STAR-T-CN"). Params: - config: a Star3Config class instance with the configuration to build a new model + config: a SpaceTCnConfig class instance with the configuration to build a new model Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -780,16 +781,16 @@ class Star3Model(PreTrainedBertModel): input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = modeling.Star3Config(vocab_size_or_config_json_file=32000, hidden_size=768, + config = modeling.SpaceTCnConfig(vocab_size_or_config_json_file=32000, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - model = modeling.Star3Model(config=config) + model = modeling.SpaceTCnModel(config=config) all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ def __init__(self, config, schema_link_module='none'): - super(Star3Model, self).__init__(config) + super(SpaceTCnModel, self).__init__(config) self.embeddings = BertEmbeddings(config) self.encoder = BertEncoder( config, schema_link_module=schema_link_module) diff --git a/modelscope/models/nlp/star_text_to_sql.py b/modelscope/models/nlp/star_text_to_sql.py index eef76e8a..089f1c89 100644 --- a/modelscope/models/nlp/star_text_to_sql.py +++ b/modelscope/models/nlp/star_text_to_sql.py @@ -20,7 +20,7 @@ __all__ = ['StarForTextToSql'] @MODELS.register_module( - Tasks.conversational_text_to_sql, module_name=Models.star) + Tasks.table_question_answering, module_name=Models.space_T_en) class StarForTextToSql(Model): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index c2134df2..8e05dd0f 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -3,27 +3,25 @@ import os from typing import Dict -import json import numpy import torch import torch.nn.functional as F -import tqdm from transformers import BertTokenizer from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor from modelscope.models.builder import MODELS -from modelscope.models.nlp.star3.configuration_star3 import Star3Config -from modelscope.models.nlp.star3.modeling_star3 import Seq2SQL, Star3Model -from modelscope.preprocessors.star3.fields.struct import Constant +from modelscope.preprocessors.space_T_cn.fields.struct import Constant from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import verify_device +from .space_T_cn.configuration_space_T_cn import SpaceTCnConfig +from .space_T_cn.modeling_space_T_cn import Seq2SQL, SpaceTCnModel __all__ = ['TableQuestionAnswering'] @MODELS.register_module( - Tasks.table_question_answering, module_name=Models.star3) + Tasks.table_question_answering, module_name=Models.space_T_cn) class TableQuestionAnswering(Model): def __init__(self, model_dir: str, *args, **kwargs): @@ -43,9 +41,9 @@ class TableQuestionAnswering(Model): os.path.join(self.model_dir, ModelFile.TORCH_MODEL_BIN_FILE), map_location='cpu') - 
self.backbone_config = Star3Config.from_json_file( + self.backbone_config = SpaceTCnConfig.from_json_file( os.path.join(self.model_dir, ModelFile.CONFIGURATION)) - self.backbone_model = Star3Model( + self.backbone_model = SpaceTCnModel( config=self.backbone_config, schema_link_module='rat') self.backbone_model.load_state_dict(state_dict['backbone_model']) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 13d440ca..34bde76a 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -606,21 +606,12 @@ TASK_OUTPUTS = { # } Tasks.task_oriented_conversation: [OutputKeys.OUTPUT], - # conversational text-to-sql result for single sample - # { - # "text": "SELECT shop.Name FROM shop." - # } - Tasks.conversational_text_to_sql: [OutputKeys.TEXT], - # table-question-answering result for single sample # { # "sql": "SELECT shop.Name FROM shop." # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]} # } - Tasks.table_question_answering: [ - OutputKeys.SQL_STRING, OutputKeys.SQL_QUERY, OutputKeys.HISTORY, - OutputKeys.QUERT_RESULT - ], + Tasks.table_question_answering: [OutputKeys.OUTPUT], # ============ audio tasks =================== # asr result for single sample diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 8c81118c..e1583387 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -69,9 +69,6 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_space_dialog-modeling'), Tasks.dialog_state_tracking: (Pipelines.dialog_state_tracking, 'damo/nlp_space_dialog-state-tracking'), - Tasks.conversational_text_to_sql: - (Pipelines.conversational_text_to_sql, - 'damo/nlp_star_conversational-text-to-sql'), Tasks.table_question_answering: (Pipelines.table_question_answering_pipeline, 'damo/nlp-convai-text2sql-pretrain-cn'), diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index c46e8c81..73c6429d 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -19,7 +19,7 @@ __all__ = ['ConversationalTextToSqlPipeline'] @PIPELINES.register_module( - Tasks.conversational_text_to_sql, + Tasks.table_question_answering, module_name=Pipelines.conversational_text_to_sql) class ConversationalTextToSqlPipeline(Pipeline): @@ -62,7 +62,7 @@ class ConversationalTextToSqlPipeline(Pipeline): Dict[str, str]: the prediction results """ sql = Example.evaluator.obtain_sql(inputs['predict'][0], inputs['db']) - result = {OutputKeys.TEXT: sql} + result = {OutputKeys.OUTPUT: {OutputKeys.TEXT: sql}} return result def _collate_fn(self, data): diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 08501953..52ba33e0 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -13,8 +13,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from modelscope.preprocessors.star3.fields.database import Database -from modelscope.preprocessors.star3.fields.struct import Constant, SQLQuery +from modelscope.preprocessors.space_T_cn.fields.database import Database +from modelscope.preprocessors.space_T_cn.fields.struct import (Constant, + SQLQuery) from 
modelscope.utils.constant import ModelFile, Tasks __all__ = ['TableQuestionAnsweringPipeline'] @@ -320,7 +321,7 @@ class TableQuestionAnsweringPipeline(Pipeline): OutputKeys.QUERT_RESULT: tabledata, } - return output + return {OutputKeys.OUTPUT: output} def _collate_fn(self, data): return data diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 63302aa7..423b3f46 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -40,7 +40,7 @@ if TYPE_CHECKING: DialogStateTrackingPreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor from .star import ConversationalTextToSqlPreprocessor - from .star3 import TableQuestionAnsweringPreprocessor + from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { @@ -81,7 +81,7 @@ else: 'DialogStateTrackingPreprocessor', 'InputFeatures' ], 'star': ['ConversationalTextToSqlPreprocessor'], - 'star3': ['TableQuestionAnsweringPreprocessor'], + 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/star3/__init__.py b/modelscope/preprocessors/space_T_cn/__init__.py similarity index 100% rename from modelscope/preprocessors/star3/__init__.py rename to modelscope/preprocessors/space_T_cn/__init__.py diff --git a/modelscope/preprocessors/star3/fields/__init__.py b/modelscope/preprocessors/space_T_cn/fields/__init__.py similarity index 100% rename from modelscope/preprocessors/star3/fields/__init__.py rename to modelscope/preprocessors/space_T_cn/fields/__init__.py diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/space_T_cn/fields/database.py similarity index 98% rename from modelscope/preprocessors/star3/fields/database.py rename to modelscope/preprocessors/space_T_cn/fields/database.py index 5debfe2c..481bd1db 100644 --- a/modelscope/preprocessors/star3/fields/database.py +++ b/modelscope/preprocessors/space_T_cn/fields/database.py @@ -4,7 +4,7 @@ import sqlite3 import json import tqdm -from modelscope.preprocessors.star3.fields.struct import Trie +from modelscope.preprocessors.space_T_cn.fields.struct import Trie class Database: diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/space_T_cn/fields/schema_link.py similarity index 99% rename from modelscope/preprocessors/star3/fields/schema_link.py rename to modelscope/preprocessors/space_T_cn/fields/schema_link.py index 220a71d8..4b8f9d31 100644 --- a/modelscope/preprocessors/star3/fields/schema_link.py +++ b/modelscope/preprocessors/space_T_cn/fields/schema_link.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
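With the wrapping above, the text-to-SQL pipelines now nest their fields under `OutputKeys.OUTPUT`. A minimal sketch of reading the table-question-answering result back, assuming the default 'damo/nlp-convai-text2sql-pretrain-cn' model registered in the builder; the question string is a placeholder:

```
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

table_qa = pipeline(
    task=Tasks.table_question_answering,
    model='damo/nlp-convai-text2sql-pretrain-cn')

# The pipeline returns {OutputKeys.OUTPUT: {...}}; unwrap it first.
result = table_qa({'question': 'placeholder question'})[OutputKeys.OUTPUT]
print(result[OutputKeys.SQL_STRING])    # sql text
print(result[OutputKeys.SQL_QUERY])     # sql query
print(result[OutputKeys.QUERT_RESULT])  # query result table
```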
import re -from modelscope.preprocessors.star3.fields.struct import TypeInfo +from modelscope.preprocessors.space_T_cn.fields.struct import TypeInfo class SchemaLinker: diff --git a/modelscope/preprocessors/star3/fields/struct.py b/modelscope/preprocessors/space_T_cn/fields/struct.py similarity index 100% rename from modelscope/preprocessors/star3/fields/struct.py rename to modelscope/preprocessors/space_T_cn/fields/struct.py diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py similarity index 96% rename from modelscope/preprocessors/star3/table_question_answering_preprocessor.py rename to modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py index ed2911f6..63e6fd57 100644 --- a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py @@ -8,8 +8,8 @@ from transformers import BertTokenizer from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.star3.fields.database import Database -from modelscope.preprocessors.star3.fields.schema_link import SchemaLinker +from modelscope.preprocessors.space_T_cn.fields.database import Database +from modelscope.preprocessors.space_T_cn.fields.schema_link import SchemaLinker from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 87a0a417..50a1c016 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -123,7 +123,6 @@ class NLPTasks(object): backbone = 'backbone' text_error_correction = 'text-error-correction' faq_question_answering = 'faq-question-answering' - conversational_text_to_sql = 'conversational-text-to-sql' information_extraction = 'information-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py index 35b374f2..bfeaf924 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/nlp_utils.py @@ -20,7 +20,7 @@ def text2sql_tracking_and_print_results( results = p(case) print({'question': item}) print(results) - last_sql = results['text'] + last_sql = results[OutputKeys.OUTPUT][OutputKeys.TEXT] history.append(item) diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 80c72337..21a4e0ce 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -16,7 +16,7 @@ from modelscope.utils.test_utils import test_level class ConversationalTextToSql(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.conversational_text_to_sql + self.task = Tasks.table_question_answering self.model_id = 'damo/nlp_star_conversational-text-to-sql' model_id = 'damo/nlp_star_conversational-text-to-sql' @@ -66,11 +66,6 @@ class ConversationalTextToSql(unittest.TestCase, DemoCompatibilityCheck): pipelines = [pipeline(task=self.task, model=self.model_id)] text2sql_tracking_and_print_results(self.test_case, pipelines) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def 
test_run_with_default_model(self): - pipelines = [pipeline(task=self.task)] - text2sql_tracking_and_print_results(self.test_case, pipelines) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 571ca795..828ef5ac 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -12,7 +12,7 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from modelscope.preprocessors.star3.fields.database import Database +from modelscope.preprocessors.space_T_cn.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.test_utils import test_level @@ -38,7 +38,7 @@ def tableqa_tracking_and_print_results_with_history( output_dict = p({ 'question': question, 'history_sql': historical_queries - }) + })[OutputKeys.OUTPUT] print('question', question) print('sql text:', output_dict[OutputKeys.SQL_STRING]) print('sql query:', output_dict[OutputKeys.SQL_QUERY]) @@ -61,7 +61,7 @@ def tableqa_tracking_and_print_results_without_history( } for p in pipelines: for question in test_case['utterance']: - output_dict = p({'question': question}) + output_dict = p({'question': question})[OutputKeys.OUTPUT] print('question', question) print('sql text:', output_dict[OutputKeys.SQL_STRING]) print('sql query:', output_dict[OutputKeys.SQL_QUERY]) @@ -92,7 +92,7 @@ def tableqa_tracking_and_print_results_with_tableid( 'question': question, 'table_id': table_id, 'history_sql': historical_queries - }) + })[OutputKeys.OUTPUT] print('question', question) print('sql text:', output_dict[OutputKeys.SQL_STRING]) print('sql query:', output_dict[OutputKeys.SQL_QUERY]) @@ -147,11 +147,6 @@ class TableQuestionAnswering(unittest.TestCase): ] tableqa_tracking_and_print_results_with_tableid(pipelines) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_from_task(self): - pipelines = [pipeline(Tasks.table_question_answering, self.model_id)] - tableqa_tracking_and_print_results_with_history(pipelines) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub_with_other_classes(self): model = Model.from_pretrained(self.model_id) From 8b28b725ee2f8a1289a13816cd1ec6198bf3ce81 Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 24 Oct 2022 14:54:42 +0800 Subject: [PATCH 102/129] [to #42322933] video detecor support output with timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 适配demoservice,增加视频时间戳输出,每一个结果对应一个时间戳 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10496879 --- .../realtime_object_detection/realtime_video_detector.py | 8 ++++++-- modelscope/models/cv/realtime_object_detection/utils.py | 9 +++++++++ .../cv/realtime_video_object_detection_pipeline.py | 6 ++++-- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 modelscope/models/cv/realtime_object_detection/utils.py diff --git a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py index fc7339b3..3830fb42 100644 --- 
a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py +++ b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py @@ -16,6 +16,7 @@ from modelscope.models.builder import MODELS from modelscope.preprocessors import LoadImage from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks +from .utils import timestamp_format from .yolox.data.data_augment import ValTransform from .yolox.exp import get_exp_by_name from .yolox.utils import postprocess @@ -99,14 +100,17 @@ class RealtimeVideoDetector(TorchModel): def inference_video(self, v_path): outputs = [] desc = 'Detecting video: {}'.format(v_path) - for frame, result in tqdm( - self.inference_video_iter(v_path), desc=desc): + for frame_idx, (frame, result) in enumerate( + tqdm(self.inference_video_iter(v_path), desc=desc)): + result = result + (timestamp_format(seconds=frame_idx + / self.fps), ) outputs.append(result) return outputs def inference_video_iter(self, v_path): capture = cv2.VideoCapture(v_path) + self.fps = capture.get(cv2.CAP_PROP_FPS) while capture.isOpened(): ret, frame = capture.read() if not ret: diff --git a/modelscope/models/cv/realtime_object_detection/utils.py b/modelscope/models/cv/realtime_object_detection/utils.py new file mode 100644 index 00000000..c3d7a4c6 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/utils.py @@ -0,0 +1,9 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math + + +def timestamp_format(seconds): + m, s = divmod(seconds, 60) + h, m = divmod(m, 60) + time = '%02d:%02d:%06.3f' % (h, m, s) + return time diff --git a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py index 3686c50a..073fad66 100644 --- a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py +++ b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py @@ -45,15 +45,17 @@ class RealtimeVideoObjectDetectionPipeline(Pipeline): **kwargs) -> str: forward_output = input['forward_output'] - scores, boxes, labels = [], [], [] + scores, boxes, labels, timestamps = [], [], [], [] for result in forward_output: - box, score, label = result + box, score, label, timestamp = result scores.append(score) boxes.append(box) labels.append(label) + timestamps.append(timestamp) return { OutputKeys.BOXES: boxes, OutputKeys.SCORES: scores, OutputKeys.LABELS: labels, + OutputKeys.TIMESTAMPS: timestamps, } From 7257f6c6fb9211ad3b671fedac065a40f9120696 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 24 Oct 2022 15:12:48 +0800 Subject: [PATCH 103/129] [to #45631658]feat support eas deploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 服务端文档链接(可能需要登录): https://test.modelscope.cn/api/v1/deployer/docs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10478609 --- modelscope/hub/api.py | 173 ++++++++++++++++++++++++++++++++- modelscope/hub/deploy.py | 189 +++++++++++++++++++++++++++++++++++++ modelscope/hub/errors.py | 5 + requirements/framework.txt | 1 + 4 files changed, 366 insertions(+), 2 deletions(-) create mode 100644 modelscope/hub/deploy.py diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f8ca683a..fd40abdf 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -12,6 +12,7 @@ from http.cookiejar import CookieJar from os.path import expanduser from typing import List, Optional, Tuple, Union +import attrs import requests from modelscope.hub.constants 
import (API_RESPONSE_FIELD_DATA, @@ -21,9 +22,14 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH, Licenses, ModelVisibility) +from modelscope.hub.deploy import (DeleteServiceParameters, + DeployServiceParameters, + GetServiceParameters, ListServiceParameters, + ServiceParameters, ServiceResourceConfig, + Vendor) from modelscope.hub.errors import (InvalidParameter, NotExistError, - NotLoginException, RequestError, - datahub_raise_on_error, + NotLoginException, NotSupportError, + RequestError, datahub_raise_on_error, handle_http_post_error, handle_http_response, is_ok, raise_on_error) from modelscope.hub.git import GitCommandWrapper @@ -306,6 +312,169 @@ class HubApi: r.raise_for_status() return None + def deploy_model(self, model_id: str, revision: str, instance_name: str, + resource: ServiceResourceConfig, + provider: ServiceParameters): + """Deploy model to cloud, current we only support PAI EAS, this is asynchronous + call , please check instance status through the console or query the instance status. + At the same time, this call may take a long time. + + Args: + model_id (str): The deployed model id + revision (str): The model revision + instance_name (str): The deployed model instance name. + resource (DeployResource): The resource information. + provider (CreateParameter): The cloud service provider parameter + + Raises: + NotLoginException: To use this api, you need login first. + NotSupportError: Not supported platform. + RequestError: The server return error. + + Returns: + InstanceInfo: The instance information. + """ + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException( + 'Token does not exist, please login first.') + if provider.vendor != Vendor.EAS: + raise NotSupportError( + 'Not support vendor: %s ,only support EAS current.' % + (provider.vendor)) + create_params = DeployServiceParameters( + instance_name=instance_name, + model_id=model_id, + revision=revision, + resource=resource, + provider=provider) + path = f'{self.endpoint}/api/v1/deployer/endpoint' + body = attrs.asdict(create_params) + r = requests.post( + path, + json=body, + cookies=cookies, + ) + handle_http_response(r, logger, cookies, 'create_eas_instance') + if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def list_deployed_model_instances(self, + provider: ServiceParameters, + skip: int = 0, + limit: int = 100): + """List deployed model instances. + + Args: + provider (ListServiceParameter): The cloud service provider parameter, + for eas, need access_key_id and access_key_secret. + skip: start of the list, current not support. + limit: maximum number of instances return, current not support + Raises: + NotLoginException: To use this api, you need login first. + RequestError: The request is failed from server. 
+ + Returns: + List: List of instance information + """ + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException( + 'Token does not exist, please login first.') + params = ListServiceParameters( + provider=provider, skip=skip, limit=limit) + path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint, + params.to_query_str()) + r = requests.get(path, cookies=cookies) + handle_http_response(r, logger, cookies, 'list_deployed_model') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def get_deployed_model_instance(self, instance_name: str, + provider: ServiceParameters): + """Query the specified instance information. + + Args: + instance_name (str): The deployed instance name. + provider (GetParameter): The cloud provider information, for eas + need region(eg: ch-hangzhou), access_key_id and access_key_secret. + + Raises: + NotLoginException: To use this api, you need login first. + RequestError: The request is failed from server. + + Returns: + Dict: The request instance information + """ + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException( + 'Token does not exist, please login first.') + params = GetServiceParameters(provider=provider) + path = '%s/api/v1/deployer/endpoint/%s?%s' % ( + self.endpoint, instance_name, params.to_query_str()) + r = requests.get(path, cookies=cookies) + handle_http_response(r, logger, cookies, 'get_deployed_model') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def delete_deployed_model_instance(self, instance_name: str, + provider: ServiceParameters): + """Delete deployed model, this api send delete command and return, it will take + some to delete, please check through the cloud console. + + Args: + instance_name (str): The instance name you want to delete. + provider (DeleteParameter): The cloud provider information, for eas + need region(eg: ch-hangzhou), access_key_id and access_key_secret. + + Raises: + NotLoginException: To call this api, you need login first. + RequestError: The request is failed. + + Returns: + Dict: The deleted instance information. 
+ """ + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException( + 'Token does not exist, please login first.') + params = DeleteServiceParameters(provider=provider) + path = '%s/api/v1/deployer/endpoint/%s?%s' % ( + self.endpoint, instance_name, params.to_query_str()) + r = requests.delete(path, cookies=cookies) + handle_http_response(r, logger, cookies, 'delete_deployed_model') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + def _check_cookie(self, use_cookies: Union[bool, CookieJar] = False) -> CookieJar: diff --git a/modelscope/hub/deploy.py b/modelscope/hub/deploy.py new file mode 100644 index 00000000..64594e0d --- /dev/null +++ b/modelscope/hub/deploy.py @@ -0,0 +1,189 @@ +import urllib +from abc import ABC, abstractmethod +from typing import Optional, Union + +import json +from attr import fields +from attrs import asdict, define, field, validators + + +class Accelerator(object): + CPU = 'cpu' + GPU = 'gpu' + + +class Vendor(object): + EAS = 'eas' + + +class EASRegion(object): + beijing = 'cn-beijing' + hangzhou = 'cn-hangzhou' + + +class EASCpuInstanceType(object): + """EAS Cpu Instance TYpe, ref(https://help.aliyun.com/document_detail/144261.html) + """ + tiny = 'ecs.c6.2xlarge' + small = 'ecs.c6.4xlarge' + medium = 'ecs.c6.6xlarge' + large = 'ecs.c6.8xlarge' + + +class EASGpuInstanceType(object): + """EAS Cpu Instance TYpe, ref(https://help.aliyun.com/document_detail/144261.html) + """ + tiny = 'ecs.gn5-c28g1.7xlarge' + small = 'ecs.gn5-c8g1.4xlarge' + medium = 'ecs.gn6i-c24g1.12xlarge' + large = 'ecs.gn6e-c12g1.3xlarge' + + +def min_smaller_than_max(instance, attribute, value): + if value > instance.max_replica: + raise ValueError( + "'min_replica' value: %s has to be smaller than 'max_replica' value: %s!" + % (value, instance.max_replica)) + + +@define +class ServiceScalingConfig(object): + """Resource scaling config + Currently we ignore max_replica + Args: + max_replica: maximum replica + min_replica: minimum replica + """ + max_replica: int = field(default=1, validator=validators.ge(1)) + min_replica: int = field( + default=1, validator=[validators.ge(1), min_smaller_than_max]) + + +@define +class ServiceResourceConfig(object): + """Eas Resource request. + + Args: + accelerator: the accelerator(cpu|gpu) + instance_type: the instance type. + scaling: The instance scaling config. + """ + instance_type: str + scaling: ServiceScalingConfig + accelerator: str = field( + default=Accelerator.CPU, + validator=validators.in_([Accelerator.CPU, Accelerator.GPU])) + + +@define +class ServiceParameters(ABC): + pass + + +@define +class EASDeployParameters(ServiceParameters): + """Parameters for EAS Deployment. + + Args: + resource_group: the resource group to deploy, current default. + region: The eas instance region(eg: cn-hangzhou). + access_key_id: The eas account access key id. + access_key_secret: The eas account access key secret. 
+ vendor: must be 'eas' + """ + region: str + access_key_id: str + access_key_secret: str + resource_group: Optional[str] = None + vendor: str = field( + default=Vendor.EAS, validator=validators.in_([Vendor.EAS])) + """ + def __init__(self, + instance_name: str, + access_key_id: str, + access_key_secret: str, + region = EASRegion.beijing, + instance_type: str = EASCpuInstances.small, + accelerator: str = Accelerator.CPU, + resource_group: Optional[str] = None, + scaling: Optional[str] = None): + self.instance_name=instance_name + self.access_key_id=self.access_key_id + self.access_key_secret = access_key_secret + self.region = region + self.instance_type = instance_type + self.accelerator = accelerator + self.resource_group = resource_group + self.scaling = scaling + """ + + +@define +class EASListParameters(ServiceParameters): + """EAS instance list parameters. + + Args: + resource_group: the resource group to deploy, current default. + region: The eas instance region(eg: cn-hangzhou). + access_key_id: The eas account access key id. + access_key_secret: The eas account access key secret. + vendor: must be 'eas' + """ + access_key_id: str + access_key_secret: str + region: str = None + resource_group: str = None + vendor: str = field( + default=Vendor.EAS, validator=validators.in_([Vendor.EAS])) + + +@define +class DeployServiceParameters(object): + """Deploy service parameters + + Args: + instance_name: the name of the service. + model_id: the modelscope model_id + revision: the modelscope model revision + resource: the resource requirement. + provider: the cloud service provider. + """ + instance_name: str + model_id: str + revision: str + resource: ServiceResourceConfig + provider: ServiceParameters + + +class AttrsToQueryString(ABC): + """Convert the attrs class to json string. + + Args: + """ + + def to_query_str(self): + self_dict = asdict( + self.provider, filter=lambda attr, value: value is not None) + json_str = json.dumps(self_dict) + print(json_str) + safe_str = urllib.parse.quote_plus(json_str) + print(safe_str) + query_param = 'provider=%s' % safe_str + return query_param + + +@define +class ListServiceParameters(AttrsToQueryString): + provider: ServiceParameters + skip: int = 0 + limit: int = 100 + + +@define +class GetServiceParameters(AttrsToQueryString): + provider: ServiceParameters + + +@define +class DeleteServiceParameters(AttrsToQueryString): + provider: ServiceParameters diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index bd7a20ac..47994bc6 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -9,6 +9,10 @@ from modelscope.utils.logger import get_logger logger = get_logger() +class NotSupportError(Exception): + pass + + class NotExistError(Exception): pass @@ -66,6 +70,7 @@ def handle_http_response(response, logger, cookies, model_id): logger.error( f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ private. 
Please login first.') + logger.error('Response details: %s' % response.content) raise error diff --git a/requirements/framework.txt b/requirements/framework.txt index b51faeda..2408cda6 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,4 +1,5 @@ addict +attrs datasets easydict einops From e223c1b00825cbdf66d60f0643224987e9e14232 Mon Sep 17 00:00:00 2001 From: "ashui.cbh" Date: Mon, 24 Oct 2022 18:47:01 +0800 Subject: [PATCH 104/129] [to #42322933]merge master after demo service support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit demo service 对接,修改输入接口为可调用的方式 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10502169 --- modelscope/pipeline_inputs.py | 4 ++++ .../pipelines/cv/image_inpainting_pipeline.py | 17 +++++++++-------- tests/pipelines/test_image_inpainting.py | 18 ++++++++---------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 77940c3c..13560229 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -91,6 +91,10 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.crowd_counting: InputType.IMAGE, + Tasks.image_inpainting: { + 'img': InputType.IMAGE, + 'mask': InputType.IMAGE, + }, # image generation task result for a single image Tasks.image_to_image_generation: diff --git a/modelscope/pipelines/cv/image_inpainting_pipeline.py b/modelscope/pipelines/cv/image_inpainting_pipeline.py index 6ae0d63e..aff9788d 100644 --- a/modelscope/pipelines/cv/image_inpainting_pipeline.py +++ b/modelscope/pipelines/cv/image_inpainting_pipeline.py @@ -77,21 +77,22 @@ class ImageInpaintingPipeline(Pipeline): img, ((0, 0), (0, out_height - height), (0, out_width - width)), mode='symmetric') - def preprocess(self, input: Input) -> Dict[str, Any]: - if isinstance(input, str): - image_name, mask_name = input.split('+') + def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: + if isinstance(input['img'], str): + image_name, mask_name = input['img'], input['mask'] img = LoadImage.convert_to_ndarray(image_name) img = self.transforms(img) mask = np.array(LoadImage(mode='L')(mask_name)['img']) mask = self.transforms(mask) - elif isinstance(input, PIL.Image.Image): - img = input.crop((0, 0, int(input.width / 2), input.height)) + elif isinstance(input['img'], PIL.Image.Image): + img = input['img'] img = self.transforms(np.array(img)) - mask = input.crop((int(input.width / 2), 0, input.width, - input.height)).convert('L') + mask = input['mask'].convert('L') mask = self.transforms(np.array(mask)) else: - raise TypeError('input should be either str or PIL.Image') + raise TypeError( + 'input should be either str or PIL.Image, and both inputs should have the same type' + ) result = dict(image=img, mask=mask[None, ...]) if self.pad_out_to_modulo is not None and self.pad_out_to_modulo > 1: diff --git a/tests/pipelines/test_image_inpainting.py b/tests/pipelines/test_image_inpainting.py index b89ce399..a8b704b7 100644 --- a/tests/pipelines/test_image_inpainting.py +++ b/tests/pipelines/test_image_inpainting.py @@ -20,6 +20,10 @@ class ImageInpaintingTest(unittest.TestCase): self.input_location = 'data/test/images/image_inpainting/image_inpainting.png' self.input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask.png' self.model_id = 'damo/cv_fft_inpainting_lama' + self.input = { + 'img': self.input_location, + 'mask': self.input_mask_location + } def save_result(self, result): vis_img = 
result[OutputKeys.OUTPUT_IMG] @@ -28,8 +32,7 @@ class ImageInpaintingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_inpainting(self): inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) - result = inpainting(self.input_location + '+' - + self.input_mask_location) + result = inpainting(self.input) if result: self.save_result(result) else: @@ -41,8 +44,7 @@ class ImageInpaintingTest(unittest.TestCase): # if input image is HR, set refine=True is more better inpainting = pipeline( Tasks.image_inpainting, model=self.model_id, refine=True) - result = inpainting(self.input_location + '+' - + self.input_mask_location) + result = inpainting(self.input) if result: self.save_result(result) else: @@ -53,10 +55,7 @@ class ImageInpaintingTest(unittest.TestCase): inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) img = Image.open(self.input_location).convert('RGB') mask = Image.open(self.input_mask_location).convert('RGB') - img_new = Image.new('RGB', (img.width + mask.width, img.height)) - img_new.paste(img, (0, 0)) - img_new.paste(mask, (img.width, 0)) - result = inpainting(img_new) + result = inpainting({'img': img, 'mask': mask}) if result: self.save_result(result) else: @@ -65,8 +64,7 @@ class ImageInpaintingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_inpainting_with_default_task(self): inpainting = pipeline(Tasks.image_inpainting) - result = inpainting(self.input_location + '+' - + self.input_mask_location) + result = inpainting(self.input) if result: self.save_result(result) else: From 35c612a64276c77fa50239ebe40bb6b977655250 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Mon, 24 Oct 2022 23:40:38 +0800 Subject: [PATCH 105/129] =?UTF-8?q?[to=20#42322933]=E5=8E=BB=E9=99=A4clip?= =?UTF-8?q?=20ut=E4=B8=AD=E7=9A=84dev=20revision=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20Link:=20https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/coder?= =?UTF-8?q?eview/10507748?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove clip ut dev revision --- tests/pipelines/test_multi_modal_embedding.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index 23954c27..ee9cdb1f 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -19,14 +19,11 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/multi-modal_clip-vit-base-patch16_zh' test_input = {'text': '皮卡丘'} - model_version = 'dev' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): pipeline_multi_modal_embedding = pipeline( - Tasks.multi_modal_embedding, - model=self.model_id, - model_revision=self.model_version) + Tasks.multi_modal_embedding, model=self.model_id) text_embedding = pipeline_multi_modal_embedding( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( @@ -36,8 +33,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained( - self.model_id, revision=self.model_version) + model = Model.from_pretrained(self.model_id) pipeline_multi_modal_embedding = pipeline( task=Tasks.multi_modal_embedding, model=model) 
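A minimal sketch of the dict-style input adopted for the image-inpainting pipeline above; the paths are the repository test assets used in the updated tests:

```
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inpainting = pipeline(Tasks.image_inpainting, model='damo/cv_fft_inpainting_lama')
result = inpainting({
    'img': 'data/test/images/image_inpainting/image_inpainting.png',
    'mask': 'data/test/images/image_inpainting/image_inpainting_mask.png',
})
vis_img = result[OutputKeys.OUTPUT_IMG]  # inpainted image
```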
text_embedding = pipeline_multi_modal_embedding( @@ -50,8 +46,7 @@ class MultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_default_model(self): pipeline_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, - model_revision=self.model_version) + task=Tasks.multi_modal_embedding) text_embedding = pipeline_multi_modal_embedding( self.test_input)[OutputKeys.TEXT_EMBEDDING] print('l1-norm: {}'.format( From c4dbb69d6538885352b6999f416ef30f55b34ae7 Mon Sep 17 00:00:00 2001 From: "zhangyanzhao.zyz" Date: Mon, 24 Oct 2022 23:41:20 +0800 Subject: [PATCH 106/129] =?UTF-8?q?[to=20#42322933]=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=AF=B9text-ranking=E4=BB=BB=E5=8A=A1=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=9A=84=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=EF=BC=8C=E4=BB=A5=E6=96=B9=E4=BE=BF=E5=BE=97=E5=88=B0=E5=AE=98?= =?UTF-8?q?=E6=96=B9=E6=A8=A1=E5=9E=8B=E6=89=93=E6=A0=87=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加对text-ranking任务中文模型的单元测试,以方便得到官方模型打标。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10492754 --- tests/pipelines/test_text_ranking.py | 43 +++++++----- tests/trainers/test_finetune_text_ranking.py | 72 +++++++++++++++++++- 2 files changed, 94 insertions(+), 21 deletions(-) diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py index ece3c617..57fa809c 100644 --- a/tests/pipelines/test_text_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -13,7 +13,11 @@ from modelscope.utils.test_utils import test_level class TextRankingTest(unittest.TestCase): - model_id = 'damo/nlp_corom_passage-ranking_english-base' + models = [ + 'damo/nlp_corom_passage-ranking_english-base', + 'damo/nlp_rom_passage-ranking_chinese-base' + ] + inputs = { 'source_sentence': ["how long it take to get a master's degree"], 'sentences_to_compare': [ @@ -26,29 +30,32 @@ class TextRankingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id) - tokenizer = TextRankingPreprocessor(cache_path) - model = TextRanking.from_pretrained(cache_path) - pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) - pipeline2 = pipeline( - Tasks.text_ranking, model=model, preprocessor=tokenizer) - print(f'sentence: {self.inputs}\n' - f'pipeline1:{pipeline1(input=self.inputs)}') - print() - print(f'pipeline2: {pipeline2(input=self.inputs)}') + for model_id in self.models: + cache_path = snapshot_download(model_id) + tokenizer = TextRankingPreprocessor(cache_path) + model = TextRanking.from_pretrained(cache_path) + pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.text_ranking, model=model, preprocessor=tokenizer) + print(f'sentence: {self.inputs}\n' + f'pipeline1:{pipeline1(input=self.inputs)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs)}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id) - tokenizer = TextRankingPreprocessor(model.model_dir) - pipeline_ins = pipeline( - task=Tasks.text_ranking, model=model, preprocessor=tokenizer) - print(pipeline_ins(input=self.inputs)) + for model_id in self.models: + model = Model.from_pretrained(model_id) + tokenizer = 
TextRankingPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): - pipeline_ins = pipeline(task=Tasks.text_ranking, model=self.model_id) - print(pipeline_ins(input=self.inputs)) + for model_id in self.models: + pipeline_ins = pipeline(task=Tasks.text_ranking, model=model_id) + print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): diff --git a/tests/trainers/test_finetune_text_ranking.py b/tests/trainers/test_finetune_text_ranking.py index e603bff2..3561cb46 100644 --- a/tests/trainers/test_finetune_text_ranking.py +++ b/tests/trainers/test_finetune_text_ranking.py @@ -14,6 +14,7 @@ from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline from modelscope.trainers import build_trainer from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.test_utils import test_level class TestFinetuneSequenceClassification(unittest.TestCase): @@ -58,6 +59,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_finetune_msmarco(self): def cfg_modify_fn(cfg): @@ -70,7 +72,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'query_sequence': 'query', 'pos_sequence': 'positive_passages', 'neg_sequence': 'negative_passages', - 'passage_text_fileds': ['title', 'text'], + 'text_fileds': ['title', 'text'], 'qid_field': 'query_id' }, 'val': { @@ -78,7 +80,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'query_sequence': 'query', 'pos_sequence': 'positive_passages', 'neg_sequence': 'negative_passages', - 'passage_text_fileds': ['title', 'text'], + 'text_fileds': ['title', 'text'], 'qid_field': 'query_id' }, } @@ -112,7 +114,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): # load dataset ds = MsDataset.load('passage-ranking-demo', 'zyznull') train_ds = ds['train'].to_hf_dataset() - dev_ds = ds['train'].to_hf_dataset() + dev_ds = ds['dev'].to_hf_dataset() model_id = 'damo/nlp_corom_passage-ranking_english-base' self.finetune( @@ -124,6 +126,70 @@ class TestFinetuneSequenceClassification(unittest.TestCase): output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) self.pipeline_text_ranking(output_dir) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_finetune_dureader(self): + + def cfg_modify_fn(cfg): + cfg.task = 'text-ranking' + cfg['preprocessor'] = {'type': 'text-ranking'} + cfg.train.optimizer.lr = 2e-5 + cfg['dataset'] = { + 'train': { + 'type': 'bert', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'text_fileds': ['text'], + 'qid_field': 'query_id' + }, + 'val': { + 'type': 'bert', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'text_fileds': ['text'], + 'qid_field': 'query_id' + }, + } + cfg['train']['neg_samples'] = 4 + cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30 + cfg.train.max_epochs = 1 + cfg.train.train_batch_size = 4 + cfg.train.lr_scheduler = { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 
'options': { + 'by_epoch': False + } + } + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 5000 + }] + return cfg + + # load dataset + ds = MsDataset.load('dureader-retrieval-ranking', 'zyznull') + train_ds = ds['train'].to_hf_dataset() + dev_ds = ds['dev'].to_hf_dataset() + + model_id = 'damo/nlp_rom_passage-ranking_chinese-base' + self.finetune( + model_id=model_id, + train_dataset=train_ds, + eval_dataset=dev_ds, + cfg_modify_fn=cfg_modify_fn) + def pipeline_text_ranking(self, model_dir): model = Model.from_pretrained(model_dir) pipeline_ins = pipeline(task=Tasks.text_ranking, model=model) From b41d275c704624e86eb4c0c4042ad971fcaff67b Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Tue, 25 Oct 2022 09:05:33 +0800 Subject: [PATCH 107/129] [to #45703335]feat: refactor deploy Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10501307 --- modelscope/hub/api.py | 173 +------------------------------- modelscope/hub/deploy.py | 210 +++++++++++++++++++++++++++++++++------ 2 files changed, 183 insertions(+), 200 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index fd40abdf..f8ca683a 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -12,7 +12,6 @@ from http.cookiejar import CookieJar from os.path import expanduser from typing import List, Optional, Tuple, Union -import attrs import requests from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, @@ -22,14 +21,9 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_USERNAME, DEFAULT_CREDENTIALS_PATH, Licenses, ModelVisibility) -from modelscope.hub.deploy import (DeleteServiceParameters, - DeployServiceParameters, - GetServiceParameters, ListServiceParameters, - ServiceParameters, ServiceResourceConfig, - Vendor) from modelscope.hub.errors import (InvalidParameter, NotExistError, - NotLoginException, NotSupportError, - RequestError, datahub_raise_on_error, + NotLoginException, RequestError, + datahub_raise_on_error, handle_http_post_error, handle_http_response, is_ok, raise_on_error) from modelscope.hub.git import GitCommandWrapper @@ -312,169 +306,6 @@ class HubApi: r.raise_for_status() return None - def deploy_model(self, model_id: str, revision: str, instance_name: str, - resource: ServiceResourceConfig, - provider: ServiceParameters): - """Deploy model to cloud, current we only support PAI EAS, this is asynchronous - call , please check instance status through the console or query the instance status. - At the same time, this call may take a long time. - - Args: - model_id (str): The deployed model id - revision (str): The model revision - instance_name (str): The deployed model instance name. - resource (DeployResource): The resource information. - provider (CreateParameter): The cloud service provider parameter - - Raises: - NotLoginException: To use this api, you need login first. - NotSupportError: Not supported platform. - RequestError: The server return error. - - Returns: - InstanceInfo: The instance information. - """ - cookies = ModelScopeConfig.get_cookies() - if cookies is None: - raise NotLoginException( - 'Token does not exist, please login first.') - if provider.vendor != Vendor.EAS: - raise NotSupportError( - 'Not support vendor: %s ,only support EAS current.' 
% - (provider.vendor)) - create_params = DeployServiceParameters( - instance_name=instance_name, - model_id=model_id, - revision=revision, - resource=resource, - provider=provider) - path = f'{self.endpoint}/api/v1/deployer/endpoint' - body = attrs.asdict(create_params) - r = requests.post( - path, - json=body, - cookies=cookies, - ) - handle_http_response(r, logger, cookies, 'create_eas_instance') - if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES: - if is_ok(r.json()): - data = r.json()[API_RESPONSE_FIELD_DATA] - return data - else: - raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) - else: - r.raise_for_status() - return None - - def list_deployed_model_instances(self, - provider: ServiceParameters, - skip: int = 0, - limit: int = 100): - """List deployed model instances. - - Args: - provider (ListServiceParameter): The cloud service provider parameter, - for eas, need access_key_id and access_key_secret. - skip: start of the list, current not support. - limit: maximum number of instances return, current not support - Raises: - NotLoginException: To use this api, you need login first. - RequestError: The request is failed from server. - - Returns: - List: List of instance information - """ - cookies = ModelScopeConfig.get_cookies() - if cookies is None: - raise NotLoginException( - 'Token does not exist, please login first.') - params = ListServiceParameters( - provider=provider, skip=skip, limit=limit) - path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint, - params.to_query_str()) - r = requests.get(path, cookies=cookies) - handle_http_response(r, logger, cookies, 'list_deployed_model') - if r.status_code == HTTPStatus.OK: - if is_ok(r.json()): - data = r.json()[API_RESPONSE_FIELD_DATA] - return data - else: - raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) - else: - r.raise_for_status() - return None - - def get_deployed_model_instance(self, instance_name: str, - provider: ServiceParameters): - """Query the specified instance information. - - Args: - instance_name (str): The deployed instance name. - provider (GetParameter): The cloud provider information, for eas - need region(eg: ch-hangzhou), access_key_id and access_key_secret. - - Raises: - NotLoginException: To use this api, you need login first. - RequestError: The request is failed from server. - - Returns: - Dict: The request instance information - """ - cookies = ModelScopeConfig.get_cookies() - if cookies is None: - raise NotLoginException( - 'Token does not exist, please login first.') - params = GetServiceParameters(provider=provider) - path = '%s/api/v1/deployer/endpoint/%s?%s' % ( - self.endpoint, instance_name, params.to_query_str()) - r = requests.get(path, cookies=cookies) - handle_http_response(r, logger, cookies, 'get_deployed_model') - if r.status_code == HTTPStatus.OK: - if is_ok(r.json()): - data = r.json()[API_RESPONSE_FIELD_DATA] - return data - else: - raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) - else: - r.raise_for_status() - return None - - def delete_deployed_model_instance(self, instance_name: str, - provider: ServiceParameters): - """Delete deployed model, this api send delete command and return, it will take - some to delete, please check through the cloud console. - - Args: - instance_name (str): The instance name you want to delete. - provider (DeleteParameter): The cloud provider information, for eas - need region(eg: ch-hangzhou), access_key_id and access_key_secret. 
- - Raises: - NotLoginException: To call this api, you need login first. - RequestError: The request is failed. - - Returns: - Dict: The deleted instance information. - """ - cookies = ModelScopeConfig.get_cookies() - if cookies is None: - raise NotLoginException( - 'Token does not exist, please login first.') - params = DeleteServiceParameters(provider=provider) - path = '%s/api/v1/deployer/endpoint/%s?%s' % ( - self.endpoint, instance_name, params.to_query_str()) - r = requests.delete(path, cookies=cookies) - handle_http_response(r, logger, cookies, 'delete_deployed_model') - if r.status_code == HTTPStatus.OK: - if is_ok(r.json()): - data = r.json()[API_RESPONSE_FIELD_DATA] - return data - else: - raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) - else: - r.raise_for_status() - return None - def _check_cookie(self, use_cookies: Union[bool, CookieJar] = False) -> CookieJar: diff --git a/modelscope/hub/deploy.py b/modelscope/hub/deploy.py index 64594e0d..397d9cc0 100644 --- a/modelscope/hub/deploy.py +++ b/modelscope/hub/deploy.py @@ -1,11 +1,25 @@ import urllib -from abc import ABC, abstractmethod -from typing import Optional, Union +from abc import ABC +from http import HTTPStatus +from typing import Optional +import attrs import json -from attr import fields +import requests from attrs import asdict, define, field, validators +from modelscope.hub.api import ModelScopeConfig +from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, + API_RESPONSE_FIELD_MESSAGE) +from modelscope.hub.errors import (NotLoginException, NotSupportError, + RequestError, handle_http_response, is_ok) +from modelscope.hub.utils.utils import get_endpoint +from modelscope.utils.logger import get_logger + +# yapf: enable + +logger = get_logger() + class Accelerator(object): CPU = 'cpu' @@ -76,12 +90,12 @@ class ServiceResourceConfig(object): @define -class ServiceParameters(ABC): +class ServiceProviderParameters(ABC): pass @define -class EASDeployParameters(ServiceParameters): +class EASDeployParameters(ServiceProviderParameters): """Parameters for EAS Deployment. Args: @@ -97,29 +111,10 @@ class EASDeployParameters(ServiceParameters): resource_group: Optional[str] = None vendor: str = field( default=Vendor.EAS, validator=validators.in_([Vendor.EAS])) - """ - def __init__(self, - instance_name: str, - access_key_id: str, - access_key_secret: str, - region = EASRegion.beijing, - instance_type: str = EASCpuInstances.small, - accelerator: str = Accelerator.CPU, - resource_group: Optional[str] = None, - scaling: Optional[str] = None): - self.instance_name=instance_name - self.access_key_id=self.access_key_id - self.access_key_secret = access_key_secret - self.region = region - self.instance_type = instance_type - self.accelerator = accelerator - self.resource_group = resource_group - self.scaling = scaling - """ @define -class EASListParameters(ServiceParameters): +class EASListParameters(ServiceProviderParameters): """EAS instance list parameters. 
Args: @@ -152,7 +147,7 @@ class DeployServiceParameters(object): model_id: str revision: str resource: ServiceResourceConfig - provider: ServiceParameters + provider: ServiceProviderParameters class AttrsToQueryString(ABC): @@ -174,16 +169,173 @@ class AttrsToQueryString(ABC): @define class ListServiceParameters(AttrsToQueryString): - provider: ServiceParameters + provider: ServiceProviderParameters skip: int = 0 limit: int = 100 @define class GetServiceParameters(AttrsToQueryString): - provider: ServiceParameters + provider: ServiceProviderParameters @define class DeleteServiceParameters(AttrsToQueryString): - provider: ServiceParameters + provider: ServiceProviderParameters + + +class ServiceDeployer(object): + + def __init__(self, endpoint=None): + self.endpoint = endpoint if endpoint is not None else get_endpoint() + self.cookies = ModelScopeConfig.get_cookies() + if self.cookies is None: + raise NotLoginException( + 'Token does not exist, please login with HubApi first.') + + # deploy_model + def create(self, model_id: str, revision: str, instance_name: str, + resource: ServiceResourceConfig, + provider: ServiceProviderParameters): + """Deploy model to cloud, current we only support PAI EAS, this is an async API , + and the deployment could take a while to finish remotely. Please check deploy instance + status separately via checking the status. + + Args: + model_id (str): The deployed model id + revision (str): The model revision + instance_name (str): The deployed model instance name. + resource (ServiceResourceConfig): The service resource information. + provider (ServiceProviderParameters): The service provider parameter + + Raises: + NotLoginException: To use this api, you need login first. + NotSupportError: Not supported platform. + RequestError: The server return error. + + Returns: + ServiceInstanceInfo: The information of the deployed service instance. + """ + if provider.vendor != Vendor.EAS: + raise NotSupportError( + 'Not support vendor: %s ,only support EAS current.' % + (provider.vendor)) + create_params = DeployServiceParameters( + instance_name=instance_name, + model_id=model_id, + revision=revision, + resource=resource, + provider=provider) + path = f'{self.endpoint}/api/v1/deployer/endpoint' + body = attrs.asdict(create_params) + r = requests.post( + path, + json=body, + cookies=self.cookies, + ) + handle_http_response(r, logger, self.cookies, 'create_service') + if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def get(self, instance_name: str, provider: ServiceProviderParameters): + """Query the specified instance information. + + Args: + instance_name (str): The deployed instance name. + provider (ServiceProviderParameters): The cloud provider information, for eas + need region(eg: ch-hangzhou), access_key_id and access_key_secret. + + Raises: + NotLoginException: To use this api, you need login first. + RequestError: The request is failed from server. + + Returns: + Dict: The information of the requested service instance. 
+ """ + params = GetServiceParameters(provider=provider) + path = '%s/api/v1/deployer/endpoint/%s?%s' % ( + self.endpoint, instance_name, params.to_query_str()) + r = requests.get(path, cookies=self.cookies) + handle_http_response(r, logger, self.cookies, 'get_service') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def delete(self, instance_name: str, provider: ServiceProviderParameters): + """Delete deployed model, this api send delete command and return, it will take + some to delete, please check through the cloud console. + + Args: + instance_name (str): The instance name you want to delete. + provider (ServiceProviderParameters): The cloud provider information, for eas + need region(eg: ch-hangzhou), access_key_id and access_key_secret. + + Raises: + NotLoginException: To call this api, you need login first. + RequestError: The request is failed. + + Returns: + Dict: The deleted instance information. + """ + params = DeleteServiceParameters(provider=provider) + path = '%s/api/v1/deployer/endpoint/%s?%s' % ( + self.endpoint, instance_name, params.to_query_str()) + r = requests.delete(path, cookies=self.cookies) + handle_http_response(r, logger, self.cookies, 'delete_service') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None + + def list(self, + provider: ServiceProviderParameters, + skip: int = 0, + limit: int = 100): + """List deployed model instances. + + Args: + provider (ServiceProviderParameters): The cloud service provider parameter, + for eas, need access_key_id and access_key_secret. + skip: start of the list, current not support. + limit: maximum number of instances return, current not support + Raises: + NotLoginException: To use this api, you need login first. + RequestError: The request is failed from server. 
+ + Returns: + List: List of instance information + """ + + params = ListServiceParameters( + provider=provider, skip=skip, limit=limit) + path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint, + params.to_query_str()) + r = requests.get(path, cookies=self.cookies) + handle_http_response(r, logger, self.cookies, 'list_service_instances') + if r.status_code == HTTPStatus.OK: + if is_ok(r.json()): + data = r.json()[API_RESPONSE_FIELD_DATA] + return data + else: + raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) + else: + r.raise_for_status() + return None From de7b6a06e9e2762ccb1afdaf3acc8536aa4f00b6 Mon Sep 17 00:00:00 2001 From: "yingda.chen" Date: Tue, 25 Oct 2022 09:28:01 +0800 Subject: [PATCH 108/129] [to #42322933] remove revision usage for face detection Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10507910 * [to #42322933] remove revision usage for face detection --- modelscope/pipelines/cv/face_recognition_pipeline.py | 2 +- tests/pipelines/test_face_detection.py | 6 ++---- tests/trainers/test_face_detection_scrfd_trainer.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index abae69d4..873e4a1f 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -49,7 +49,7 @@ class FaceRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id, model_revision='v2') + Tasks.face_detection, model=det_model_id) def _choose_face(self, det_result, diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index 31ae403e..db513a80 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -28,8 +28,7 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): input_location = ['data/test/images/face_detection2.jpeg'] dataset = MsDataset.load(input_location, target='image') - face_detection = pipeline( - Tasks.face_detection, model=self.model_id, model_revision='v2') + face_detection = pipeline(Tasks.face_detection, model=self.model_id) # note that for dataset output, the inference-output is a Generator that can be iterated. 
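A minimal usage sketch of the ServiceDeployer API defined above. Only the create/get/list/delete signatures are taken from the code itself; the import path and the constructor fields of ServiceResourceConfig and ServiceProviderParameters are assumptions, and every credential value is a placeholder.

# Illustrative sketch, not part of the patch. Import path and the fields of
# ServiceResourceConfig / ServiceProviderParameters are assumptions; only the
# ServiceDeployer method signatures come from the class above. A prior
# HubApi().login(<token>) is required so that cookies exist.
from modelscope.hub.deploy import (ServiceDeployer,            # assumed module path
                                   ServiceProviderParameters,
                                   ServiceResourceConfig, Vendor)

provider = ServiceProviderParameters(      # hypothetical field names for EAS
    vendor=Vendor.EAS,
    region='cn-hangzhou',
    access_key_id='<access-key-id>',
    access_key_secret='<access-key-secret>')
resource = ServiceResourceConfig(instance_count=1)  # hypothetical field

deployer = ServiceDeployer()  # endpoint defaults to get_endpoint()

# create() is asynchronous: it returns immediately and the EAS deployment
# finishes later, so the status has to be polled via get().
info = deployer.create(
    model_id='<model-id>',
    revision='<model-revision>',
    instance_name='my-service',
    resource=resource,
    provider=provider)
print(info)

print(deployer.get('my-service', provider))    # poll deployment status
print(deployer.list(provider))                 # all deployed instances
deployer.delete('my-service', provider)        # tear down when finished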
result = face_detection(dataset) result = next(result) @@ -37,8 +36,7 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - face_detection = pipeline( - Tasks.face_detection, model=self.model_id, model_revision='v2') + face_detection = pipeline(Tasks.face_detection, model=self.model_id) img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) diff --git a/tests/trainers/test_face_detection_scrfd_trainer.py b/tests/trainers/test_face_detection_scrfd_trainer.py index eb9440ef..97b0eca7 100644 --- a/tests/trainers/test_face_detection_scrfd_trainer.py +++ b/tests/trainers/test_face_detection_scrfd_trainer.py @@ -28,7 +28,7 @@ def _setup(): val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' max_epochs = 1 # run epochs in unit test - cache_path = snapshot_download(model_id, revision='v2') + cache_path = snapshot_download(model_id) tmp_dir = tempfile.TemporaryDirectory().name if not os.path.exists(tmp_dir): From 6178f46910b9a909a989d68645c4a6bb4eb6bd05 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Tue, 25 Oct 2022 09:49:02 +0800 Subject: [PATCH 109/129] [to #42322933] add ut for multi threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 修复multi thread引起的问题 2. 增加multi thread的unittest Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10502008 --- .../nlp/table_question_answering_pipeline.py | 6 ++++- .../space_T_cn/fields/database.py | 3 ++- .../test_table_question_answering.py | 24 +++++++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 52ba33e0..fc0d07b1 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -17,6 +17,9 @@ from modelscope.preprocessors.space_T_cn.fields.database import Database from modelscope.preprocessors.space_T_cn.fields.struct import (Constant, SQLQuery) from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() __all__ = ['TableQuestionAnsweringPipeline'] @@ -309,7 +312,8 @@ class TableQuestionAnsweringPipeline(Pipeline): 'header_name': header_names, 'rows': rows } - except Exception: + except Exception as e: + logger.error(e) tabledata = {'header_id': [], 'header_name': [], 'rows': []} else: tabledata = {'header_id': [], 'header_name': [], 'rows': []} diff --git a/modelscope/preprocessors/space_T_cn/fields/database.py b/modelscope/preprocessors/space_T_cn/fields/database.py index 481bd1db..7ae38ee2 100644 --- a/modelscope/preprocessors/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/space_T_cn/fields/database.py @@ -17,7 +17,8 @@ class Database: self.tokenizer = tokenizer self.is_use_sqlite = is_use_sqlite if self.is_use_sqlite: - self.connection_obj = sqlite3.connect(':memory:') + self.connection_obj = sqlite3.connect( + ':memory:', check_same_thread=False) self.type_dict = {'text': 'TEXT', 'number': 'INT', 'date': 'TEXT'} self.tables = self.init_tables(table_file_path=table_file_path) self.syn_dict = self.init_syn_dict( diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 828ef5ac..44f1531b 100644 --- a/tests/pipelines/test_table_question_answering.py +++ 
b/tests/pipelines/test_table_question_answering.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os import unittest +from threading import Thread from typing import List import json @@ -108,8 +109,6 @@ class TableQuestionAnswering(unittest.TestCase): self.task = Tasks.table_question_answering self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn' - model_id = 'damo/nlp_convai_text2sql_pretrain_cn' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) @@ -122,6 +121,27 @@ class TableQuestionAnswering(unittest.TestCase): ] tableqa_tracking_and_print_results_with_history(pipelines) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download_with_multithreads(self): + cache_path = snapshot_download(self.model_id) + pl = pipeline(Tasks.table_question_answering, model=cache_path) + + def print_func(pl, i): + result = pl({ + 'question': '长江流域的小(2)型水库的库容总量是多少?', + 'table_id': 'reservoir', + 'history_sql': None + }) + print(i, json.dumps(result)) + + procs = [] + for i in range(5): + proc = Thread(target=print_func, args=(pl, i)) + procs.append(proc) + proc.start() + for proc in procs: + proc.join() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) From a1738690c9f086a09831da059a5f7a6bb1736ebb Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 25 Oct 2022 10:08:57 +0800 Subject: [PATCH 110/129] [to #42322933]test_image_denoise_trainer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10465138 --- tests/trainers/test_image_denoise_trainer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index c4abca6a..b742dcae 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -34,14 +34,14 @@ class ImageDenoiseTrainerTest(unittest.TestCase): 'SIDD', namespace='huizheng', subset_name='default', - split='validation', - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + split='test', + download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds dataset_val = MsDataset.load( 'SIDD', namespace='huizheng', subset_name='default', split='test', - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + download_mode=DownloadMode.FORCE_REDOWNLOAD)._hf_ds self.dataset_train = SiddImageDenoisingDataset( dataset_train, self.config.dataset, is_train=True) self.dataset_val = SiddImageDenoisingDataset( @@ -51,7 +51,7 @@ class ImageDenoiseTrainerTest(unittest.TestCase): shutil.rmtree(self.tmp_dir, ignore_errors=True) super().tearDown() - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): kwargs = dict( model=self.model_id, @@ -65,7 +65,7 @@ class ImageDenoiseTrainerTest(unittest.TestCase): for i in range(2): self.assertIn(f'epoch_{i+1}.pth', results_files) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_model_and_args(self): model = NAFNetForImageDenoise.from_pretrained(self.cache_path) kwargs = dict( From 525fa3ea89d678e5c6e7c6d42616304454ca35ef Mon Sep 17 00:00:00 
2001 From: "bin.xue" Date: Tue, 25 Oct 2022 12:10:07 +0800 Subject: [PATCH 111/129] [to #42322933]test: use 'master' branch in training test Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10509580 --- .../pipelines/test_key_word_spotting_farfield.py | 5 ++--- tests/pipelines/test_speech_signal_process.py | 15 ++++++--------- tests/trainers/audio/test_ans_trainer.py | 4 +--- tests/trainers/audio/test_kws_farfield_trainer.py | 2 -- 4 files changed, 9 insertions(+), 17 deletions(-) diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index bf61c9e7..69d6a953 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -9,9 +9,8 @@ from modelscope.utils.test_utils import test_level TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav' TEST_SPEECH_FILE_MONO = 'data/test/audios/1ch_nihaomiya.wav' -TEST_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ - 'speech_dfsmn_kws_char_farfield_16k_nihaomiya/repo' \ - '?Revision=master&FilePath=examples/3ch_nihaomiya.wav' +TEST_SPEECH_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ + 'test/audios/3ch_nihaomiya.wav' class KWSFarfieldTest(unittest.TestCase): diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index e5f97c02..2916d31a 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -11,17 +11,14 @@ from modelscope.utils.test_utils import test_level NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav' FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav' -NEAREND_MIC_URL = 'https://modelscope.cn/api/v1/models/damo/' \ - 'speech_dfsmn_aec_psm_16k/repo?Revision=master' \ - '&FilePath=examples/nearend_mic.wav' -FAREND_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ - 'speech_dfsmn_aec_psm_16k/repo?Revision=master' \ - '&FilePath=examples/farend_speech.wav' +NEAREND_MIC_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ + 'test/audios/nearend_mic.wav' +FAREND_SPEECH_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ + 'test/audios/farend_speech.wav' NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav' -NOISE_SPEECH_URL = 'https://modelscope.cn/api/v1/models/damo/' \ - 'speech_frcrn_ans_cirm_16k/repo?Revision=master' \ - '&FilePath=examples/speech_with_noise.wav' +NOISE_SPEECH_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/' \ + 'test/audios/speech_with_noise.wav' class SpeechSignalProcessTest(unittest.TestCase, DemoCompatibilityCheck): diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py index c0860529..d897e6a9 100644 --- a/tests/trainers/audio/test_ans_trainer.py +++ b/tests/trainers/audio/test_ans_trainer.py @@ -17,7 +17,6 @@ SEGMENT_LENGTH_TEST = 640 class TestANSTrainer(unittest.TestCase): - REVISION = 'beta' def setUp(self): self.tmp_dir = tempfile.TemporaryDirectory().name @@ -25,7 +24,7 @@ class TestANSTrainer(unittest.TestCase): os.makedirs(self.tmp_dir) self.model_id = 'damo/speech_frcrn_ans_cirm_16k' - cfg = read_config(self.model_id, revision=self.REVISION) + cfg = read_config(self.model_id) cfg.train.max_epochs = 2 cfg.train.dataloader.batch_size_per_gpu = 1 self.cfg_file = os.path.join(self.tmp_dir, 'train_config.json') @@ -48,7 +47,6 @@ class TestANSTrainer(unittest.TestCase): def test_trainer(self): kwargs = dict( model=self.model_id, - model_revision=self.REVISION, 
train_dataset=self.dataset, eval_dataset=self.dataset, max_epochs=2, diff --git a/tests/trainers/audio/test_kws_farfield_trainer.py b/tests/trainers/audio/test_kws_farfield_trainer.py index 2631a542..70b68a11 100644 --- a/tests/trainers/audio/test_kws_farfield_trainer.py +++ b/tests/trainers/audio/test_kws_farfield_trainer.py @@ -16,7 +16,6 @@ NOISE_2CH_FILE = 'data/test/audios/noise_2ch.wav' class TestKwsFarfieldTrainer(unittest.TestCase): - REVISION = 'beta' def setUp(self): self.tmp_dir = tempfile.TemporaryDirectory().name @@ -70,7 +69,6 @@ class TestKwsFarfieldTrainer(unittest.TestCase): kwargs = dict( model=self.model_id, work_dir=self.tmp_dir, - model_revision=self.REVISION, workers=2, max_epochs=2, train_iters_per_epoch=2, From 6d51f44dc710ab4dd5816cb8c65192b24fe4a4b8 Mon Sep 17 00:00:00 2001 From: "siyang.ssy" Date: Tue, 25 Oct 2022 12:11:28 +0800 Subject: [PATCH 112/129] [to #42322933]fix input type for video embeding Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10506601 --- .../mmr/models/clip_for_mm_video_embedding.py | 41 +++++++++++++++---- .../test_video_multi_modal_embedding.py | 4 +- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 2a72985f..f1b1a6c7 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,9 +1,13 @@ # The implementation is adopted from the CLIP4Clip implementation, # made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +import os import random +import uuid from os.path import exists +from tempfile import TemporaryDirectory from typing import Any, Dict +from urllib.parse import urlparse import json import numpy as np @@ -11,6 +15,7 @@ import torch from decord import VideoReader, cpu from PIL import Image +from modelscope.hub.file_download import http_get_file from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS @@ -68,12 +73,16 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): self.model.to(self.device) def _get_text(self, caption, tokenizer, enable_zh=False): - if len(caption) == 3: - _caption_text, s, e = caption - elif len(caption) == 4: - _caption_text, s, e, pos = caption - else: - NotImplementedError + + if type(caption) is str: + _caption_text, s, e = caption, None, None + elif type(caption) is tuple: + if len(caption) == 3: + _caption_text, s, e = caption + elif len(caption) == 4: + _caption_text, s, e, pos = caption + else: + NotImplementedError if isinstance(_caption_text, list): caption_text = random.choice(_caption_text) @@ -137,11 +146,25 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): elif start_time == end_time: end_time = end_time + 1 - if exists(video_path): + url_parsed = urlparse(video_path) + if url_parsed.scheme in ('file', '') and exists( + url_parsed.path): # Possibly a local file vreader = VideoReader(video_path, ctx=cpu(0)) else: - logger.error('non video input, output is wrong!!!') - return video, video_mask + try: + with TemporaryDirectory() as temporary_cache_dir: + random_str = uuid.uuid4().hex + http_get_file( + url=video_path, + local_dir=temporary_cache_dir, + file_name=random_str, + cookies=None) + temp_file_path = os.path.join(temporary_cache_dir, + random_str) + vreader = VideoReader(temp_file_path, 
ctx=cpu(0)) + except Exception as ex: + logger.error('non video input, output is {}!!!'.format(ex)) + return video, video_mask fps = vreader.get_avg_fps() f_start = 0 if start_time is None else int(start_time * fps) diff --git a/tests/pipelines/test_video_multi_modal_embedding.py b/tests/pipelines/test_video_multi_modal_embedding.py index f4aa4d24..afe5940d 100644 --- a/tests/pipelines/test_video_multi_modal_embedding.py +++ b/tests/pipelines/test_video_multi_modal_embedding.py @@ -17,8 +17,8 @@ class VideoMultiModalEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.video_multi_modal_embedding self.model_id = 'damo/multi_modal_clip_vtretrival_msrvtt_53' - video_path = 'data/test/videos/multi_modal_test_video_9770.mp4' - caption = ('a person is connecting something to system', None, None) + video_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/multi_modal_test_video_9770.mp4' + caption = 'a person is connecting something to system' _input = {'video': video_path, 'text': caption} @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From 605cd7f44a19b083b8fe4a7ca78d749ab8b52574 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 25 Oct 2022 12:26:25 +0800 Subject: [PATCH 113/129] [to #42322933] NLP 1030 Refactor Features: 1. Refactor the directory structure of nlp models. All model files are placed into either the model folder or the task_model folder 2. Refactor all the comments to google style 3. Add detail comments to important tasks and nlp models, to list the description of the model, and its preprocessor&trainer 4. Model Exporting now supports a direct all to TorchModelExporter(no need to derive from it) 5. Refactor model save_pretrained method to support direct running(independent from trainer) 6. Remove the judgement of Model in the pipeline base class, to support outer register models running in our pipelines 7. Nlp trainer now has a NLPTrainingArguments class , user can pass arguments into the dataclass, and use it as a normal cfg_modify_fn, to simplify the operation of modify cfg. 8. Merge the BACKBONES and the MODELS, so user can get a backbone with the Model.from_pretrained call 9. Model.from_pretrained now support a task argument, so user can use a backbone and load it with a specific task class. 10. Support Preprocessor.from_pretrained method 11. Add standard return classes to important nlp tasks, so some of the pipelines and the models are independent now, the return values of the models will always be tensors, and the pipelines will take care of the conversion to numpy and the following stuffs. 12. Split the file of the nlp preprocessors, to make the dir structure more clear. Bugs Fixing: 1. Fix a bug that lr_scheduler can be called earlier than the optimizer's step 2. Fix a bug that the direct call of Pipelines (not from pipeline(xxx)) throws error 3. Fix a bug that the trainer will not call the correct TaskDataset class 4. 
Fix a bug that the internal loading of dataset will throws error in the trainer class Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10490585 --- data/test/regression/fill_mask_sbert_zh.bin | 4 +- data/test/regression/fill_mask_veco_en.bin | 4 +- data/test/regression/fill_mask_veco_zh.bin | 4 +- data/test/regression/sbert-base-tnews.bin | 4 +- data/test/regression/sbert_nli.bin | 4 +- data/test/regression/sbert_sen_sim.bin | 4 +- data/test/regression/sbert_ws_en.bin | 4 +- data/test/regression/sbert_ws_zh.bin | 4 +- data/test/regression/sbert_zero_shot.bin | 4 +- modelscope/exporters/base.py | 22 +- ...rt_for_sequence_classification_exporter.py | 13 +- modelscope/exporters/torch_model_exporter.py | 115 +- modelscope/metainfo.py | 1 - modelscope/metrics/base.py | 3 - .../metrics/token_classification_metric.py | 17 +- modelscope/models/base/base_model.py | 26 +- modelscope/models/builder.py | 17 +- modelscope/models/nlp/T5/__init__.py | 8 +- .../nlp/T5/{modeling_t5.py => backbone.py} | 1014 +++------ .../{configuration_t5.py => configuration.py} | 1 + .../models/nlp/T5/t5_for_text_generation.py | 56 - .../models/nlp/T5/text2text_generation.py | 455 ++++ modelscope/models/nlp/__init__.py | 125 +- modelscope/models/nlp/backbones/bert.py | 7 - modelscope/models/nlp/backbones/structbert.py | 52 - modelscope/models/nlp/bart/__init__.py | 2 + .../text_error_correction.py} | 0 modelscope/models/nlp/bert/__init__.py | 48 +- modelscope/models/nlp/bert/backbone.py | 952 ++++++++ ...configuration_bert.py => configuration.py} | 1 + .../document_segmentation.py} | 1 + modelscope/models/nlp/bert/fill_mask.py | 299 +++ modelscope/models/nlp/bert/modeling_bert.py | 1961 ---------------- .../models/nlp/bert/sentence_embedding.py | 113 + .../models/nlp/bert/text_classification.py | 208 ++ modelscope/models/nlp/bert/text_ranking.py | 89 + .../models/nlp/bert/token_classification.py | 225 ++ modelscope/models/nlp/csanmt/__init__.py | 2 + .../translation.py} | 0 modelscope/models/nlp/deberta_v2/__init__.py | 40 +- .../{modeling_deberta_v2.py => backbone.py} | 729 +----- ...uration_deberta_v2.py => configuration.py} | 2 - modelscope/models/nlp/deberta_v2/fill_mask.py | 230 ++ ...nization_deberta_v2.py => tokenization.py} | 0 ...eberta_v2_fast.py => tokenization_fast.py} | 2 +- modelscope/models/nlp/gpt3/__init__.py | 16 +- .../gpt3/{modeling_gpt3.py => backbone.py} | 2 +- ...configuration_gpt3.py => configuration.py} | 0 ..._text_generation.py => text_generation.py} | 0 .../gpt3/{tokenizer_gpt3.py => tokenizer.py} | 0 .../nlp/{backbones => gpt_neo}/__init__.py | 6 +- .../gpt_neo.py => gpt_neo/backbone.py} | 5 +- .../nlp/heads/token_classification_head.py | 4 +- modelscope/models/nlp/masked_language.py | 164 -- modelscope/models/nlp/palm_v2/__init__.py | 12 +- .../palm_v2/{modeling_palm.py => backbone.py} | 2 +- ...configuration_palm.py => configuration.py} | 0 ..._text_generation.py => text_generation.py} | 0 modelscope/models/nlp/plug/__init__.py | 8 +- .../plug/{modeling_plug.py => backbone.py} | 2 +- ...configuration_plug.py => configuration.py} | 0 .../models/nlp/plug/distributed_plug.py | 3 +- modelscope/models/nlp/ponet/__init__.py | 16 +- .../ponet/{modeling_ponet.py => backbone.py} | 857 +------ ...onfiguration_ponet.py => configuration.py} | 6 +- modelscope/models/nlp/ponet/fill_mask.py | 252 +++ ...{tokenization_ponet.py => tokenization.py} | 1 + .../models/nlp/ponet_for_masked_language.py | 53 - modelscope/models/nlp/sentence_embedding.py | 74 - 
.../models/nlp/sequence_classification.py | 287 --- modelscope/models/nlp/space/__init__.py | 20 +- ...onfiguration_space.py => configuration.py} | 0 ...diction.py => dialog_intent_prediction.py} | 23 +- ..._dialog_modeling.py => dialog_modeling.py} | 22 +- ...ling_space.py => dialog_state_tracking.py} | 224 +- modelscope/models/nlp/space/model/__init__.py | 4 +- .../models/nlp/space/model/generator.py | 19 +- .../models/nlp/space/model/model_base.py | 11 +- .../nlp/space/model/tokenization_space.py | 2 +- .../nlp/space/model/unified_transformer.py | 22 +- .../nlp/space/modules/transformer_block.py | 17 +- .../space/space_for_dialog_state_tracking.py | 101 - modelscope/models/nlp/space_T_cn/__init__.py | 21 + .../{modeling_space_T_cn.py => backbone.py} | 5 +- ...uration_space_T_cn.py => configuration.py} | 2 +- .../table_question_answering.py | 38 +- modelscope/models/nlp/space_T_en/__init__.py | 21 + .../text_to_sql.py} | 34 +- modelscope/models/nlp/structbert/__init__.py | 28 +- modelscope/models/nlp/structbert/backbone.py | 932 ++++++++ ...onfiguration_sbert.py => configuration.py} | 12 +- .../faq_question_answering.py} | 238 +- modelscope/models/nlp/structbert/fill_mask.py | 284 +++ .../models/nlp/structbert/modeling_sbert.py | 1963 ----------------- .../nlp/structbert/text_classification.py | 235 ++ .../nlp/structbert/token_classification.py | 229 ++ ...{tokenization_sbert.py => tokenization.py} | 0 ...ion_sbert_fast.py => tokenization_fast.py} | 2 +- modelscope/models/nlp/task_models/__init__.py | 7 + .../nlp/task_models/feature_extraction.py | 10 +- .../models/nlp/task_models/fill_mask.py | 3 +- .../nlp/task_models/information_extraction.py | 2 +- .../nncrf_for_named_entity_recognition.py | 121 +- .../task_models/sequence_classification.py | 24 +- .../models/nlp/task_models/task_model.py | 4 +- .../nlp/task_models/token_classification.py | 31 +- modelscope/models/nlp/text_ranking.py | 80 - modelscope/models/nlp/token_classification.py | 245 -- modelscope/models/nlp/veco/__init__.py | 24 +- modelscope/models/nlp/veco/backbone.py | 96 + ...configuration_veco.py => configuration.py} | 0 modelscope/models/nlp/veco/fill_mask.py | 99 + modelscope/models/nlp/veco/modeling_veco.py | 143 -- .../models/nlp/veco/text_classification.py | 150 ++ .../models/nlp/veco/token_classification.py | 107 + .../{tokenization_veco.py => tokenization.py} | 0 ...tion_veco_fast.py => tokenization_fast.py} | 2 +- .../task_datasets/torch_base_dataset.py | 1 + modelscope/outputs/__init__.py | 2 + .../fields => outputs/nlp}/__init__.py | 0 modelscope/outputs/nlp/model_outputs.py | 543 +++++ modelscope/{ => outputs}/outputs.py | 61 +- modelscope/pipelines/base.py | 36 +- modelscope/pipelines/builder.py | 6 +- modelscope/pipelines/nlp/__init__.py | 8 +- .../conversational_text_to_sql_pipeline.py | 13 - .../nlp/dialog_state_tracking_pipeline.py | 7 +- .../nlp/distributed_plug_pipeline.py | 11 +- .../nlp/faq_question_answering_pipeline.py | 28 +- .../pipelines/nlp/fill_mask_pipeline.py | 118 +- .../pipelines/nlp/fill_mask_ponet_pipeline.py | 136 -- .../nlp/named_entity_recognition_pipeline.py | 69 +- .../nlp/sentence_embedding_pipeline.py | 24 +- .../nlp/sequence_classification_pipeline.py | 84 - .../nlp/table_question_answering_pipeline.py | 6 +- .../nlp/text_classification_pipeline.py | 125 +- .../pipelines/nlp/text_ranking_pipeline.py | 20 +- .../nlp/token_classification_pipeline.py | 104 +- .../pipelines/nlp/translation_pipeline.py | 3 +- .../nlp/word_segmentation_pipeline.py | 66 +- 
.../nlp/zero_shot_classification_pipeline.py | 5 +- modelscope/preprocessors/__init__.py | 61 +- modelscope/preprocessors/base.py | 73 +- modelscope/preprocessors/nlp/__init__.py | 88 +- .../nlp/bert_seq_cls_tokenizer.py | 23 + .../nlp/document_segmentation_preprocessor.py | 220 ++ .../faq_question_answering_preprocessor.py | 90 + .../nlp/fill_mask_preprocessor.py | 142 ++ modelscope/preprocessors/nlp/nlp_base.py | 1178 ++-------- .../nlp/relation_extraction_preprocessor.py | 55 + .../sentence_classification_preprocessor.py | 25 + .../nlp/sentence_embedding_preprocessor.py | 52 + .../nlp/sentence_piece_preprocessor.py | 32 + .../preprocessors/{ => nlp}/space/__init__.py | 0 .../preprocessors/{ => nlp}/space/args.py | 5 +- .../preprocessors/{ => nlp}/space/batch.py | 3 + .../{ => nlp}/space/data_loader.py | 16 +- .../dialog_intent_prediction_preprocessor.py | 20 +- .../space/dialog_modeling_preprocessor.py | 17 +- .../dialog_state_tracking_preprocessor.py | 10 +- .../{ => nlp}/space/dst_processors.py | 0 .../nlp/space/fields/__init__.py | 23 + .../{ => nlp}/space/fields/gen_field.py | 2 +- .../{ => nlp}/space/fields/intent_field.py | 2 +- .../{ => nlp}/space/lazy_dataset.py | 7 +- .../{ => nlp}/space/preprocess.py | 7 +- .../preprocessors/{ => nlp}/space/sampler.py | 4 +- .../{ => nlp}/space/tensorlistdataset.py | 0 .../{ => nlp}/space/tokenizer.py | 2 + .../{ => nlp}/space_T_cn/__init__.py | 0 .../nlp/space_T_cn/fields/__init__.py | 0 .../{ => nlp}/space_T_cn/fields/database.py | 2 +- .../space_T_cn/fields/schema_link.py | 2 +- .../{ => nlp}/space_T_cn/fields/struct.py | 0 .../table_question_answering_preprocessor.py | 5 +- .../{star => nlp/space_T_en}/__init__.py | 0 ...conversational_text_to_sql_preprocessor.py | 17 +- .../space_T_en}/fields/__init__.py | 0 .../space_T_en}/fields/common_utils.py | 0 .../{star => nlp/space_T_en}/fields/parse.py | 0 .../space_T_en}/fields/preprocess_dataset.py | 2 +- .../space_T_en}/fields/process_dataset.py | 5 - .../nlp/text2text_generation_preprocessor.py | 40 + .../nlp/text_error_correction.py | 5 +- .../nlp/text_generation_jieba_preprocessor.py | 44 + .../nlp/text_generation_preprocessor.py | 62 + .../nlp/text_ranking_preprocessor.py | 67 + .../nlp/token_classification_preprocessor.py | 261 +++ .../zero_shot_classification_reprocessor.py | 51 + .../preprocessors/space/fields/__init__.py | 2 - .../space/fields/dst_processors.py | 1523 ------------- modelscope/trainers/__init__.py | 5 +- modelscope/trainers/default_config.py | 3 +- .../trainers/hooks/lr_scheduler_hook.py | 3 +- modelscope/trainers/hooks/optimizer/base.py | 1 + modelscope/trainers/nlp/__init__.py | 2 +- .../nlp/space/dialog_intent_trainer.py | 73 +- .../nlp/space/dialog_modeling_trainer.py | 3 +- .../trainers/nlp/space/trainer/gen_trainer.py | 9 +- .../nlp/space/trainer/intent_trainer.py | 151 +- .../trainers/nlp/text_ranking_trainer.py | 14 +- modelscope/trainers/nlp_trainer.py | 491 ++++- modelscope/trainers/trainer.py | 74 +- modelscope/utils/checkpoint.py | 2 +- modelscope/utils/constant.py | 1 - modelscope/utils/hub.py | 10 +- modelscope/utils/nlp/space/args.py | 4 +- modelscope/utils/nlp/space/clean_dataset.py | 2 + modelscope/utils/nlp/space/criterions.py | 2 + modelscope/utils/nlp/space/db_ops.py | 2 + modelscope/utils/nlp/space/ontology.py | 2 + modelscope/utils/nlp/space/scores.py | 3 + modelscope/utils/nlp/space/utils.py | 2 + modelscope/utils/nlp/space/utils_dst.py | 26 + modelscope/utils/nlp/space_T_en/__init__.py | 0 .../nlp/{nlp_utils.py => space_T_en/utils.py} | 24 +- 
modelscope/utils/registry.py | 1 + modelscope/utils/regress_test_utils.py | 70 +- modelscope/utils/tensor_utils.py | 14 +- ...st_export_sbert_sequence_classification.py | 39 +- tests/hub/test_download_dataset.py | 709 ++++++ tests/models/test_deberta_v2_backbone.py | 23 + tests/outputs/__init__.py | 0 tests/outputs/test_model_outputs.py | 30 + tests/pipelines/nlp/test_faq.py | 59 + .../test_conversational_text_to_sql.py | 3 +- .../test_dialog_intent_prediction.py | 9 +- tests/pipelines/test_dialog_modeling.py | 18 +- tests/pipelines/test_dialog_state_tracking.py | 16 +- .../pipelines/test_faq_question_answering.py | 6 +- tests/pipelines/test_fill_mask.py | 17 +- tests/pipelines/test_nli.py | 7 +- tests/pipelines/test_part_of_speech.py | 2 +- tests/pipelines/test_sentence_embedding.py | 4 +- tests/pipelines/test_sentence_similarity.py | 5 +- .../test_sentiment_classification.py | 5 +- .../test_table_question_answering.py | 2 +- tests/pipelines/test_text_classification.py | 100 + tests/pipelines/test_text_ranking.py | 4 +- .../test_finetune_sequence_classification.py | 53 +- tests/trainers/test_trainer_with_nlp.py | 3 +- 241 files changed, 10587 insertions(+), 11541 deletions(-) rename modelscope/models/nlp/T5/{modeling_t5.py => backbone.py} (73%) rename modelscope/models/nlp/T5/{configuration_t5.py => configuration.py} (99%) delete mode 100644 modelscope/models/nlp/T5/t5_for_text_generation.py create mode 100644 modelscope/models/nlp/T5/text2text_generation.py delete mode 100644 modelscope/models/nlp/backbones/bert.py delete mode 100644 modelscope/models/nlp/backbones/structbert.py create mode 100644 modelscope/models/nlp/bart/__init__.py rename modelscope/models/nlp/{bart_for_text_error_correction.py => bart/text_error_correction.py} (100%) create mode 100755 modelscope/models/nlp/bert/backbone.py rename modelscope/models/nlp/bert/{configuration_bert.py => configuration.py} (99%) rename modelscope/models/nlp/{bert_for_document_segmentation.py => bert/document_segmentation.py} (99%) create mode 100644 modelscope/models/nlp/bert/fill_mask.py delete mode 100755 modelscope/models/nlp/bert/modeling_bert.py create mode 100644 modelscope/models/nlp/bert/sentence_embedding.py create mode 100644 modelscope/models/nlp/bert/text_classification.py create mode 100644 modelscope/models/nlp/bert/text_ranking.py create mode 100644 modelscope/models/nlp/bert/token_classification.py create mode 100644 modelscope/models/nlp/csanmt/__init__.py rename modelscope/models/nlp/{csanmt_for_translation.py => csanmt/translation.py} (100%) rename modelscope/models/nlp/deberta_v2/{modeling_deberta_v2.py => backbone.py} (64%) rename modelscope/models/nlp/deberta_v2/{configuration_deberta_v2.py => configuration.py} (98%) create mode 100644 modelscope/models/nlp/deberta_v2/fill_mask.py rename modelscope/models/nlp/deberta_v2/{tokenization_deberta_v2.py => tokenization.py} (100%) rename modelscope/models/nlp/deberta_v2/{tokenization_deberta_v2_fast.py => tokenization_fast.py} (99%) rename modelscope/models/nlp/gpt3/{modeling_gpt3.py => backbone.py} (99%) rename modelscope/models/nlp/gpt3/{configuration_gpt3.py => configuration.py} (100%) rename modelscope/models/nlp/gpt3/{gpt3_for_text_generation.py => text_generation.py} (100%) rename modelscope/models/nlp/gpt3/{tokenizer_gpt3.py => tokenizer.py} (100%) rename modelscope/models/nlp/{backbones => gpt_neo}/__init__.py (83%) rename modelscope/models/nlp/{backbones/gpt_neo.py => gpt_neo/backbone.py} (74%) delete mode 100644 modelscope/models/nlp/masked_language.py rename 
modelscope/models/nlp/palm_v2/{modeling_palm.py => backbone.py} (99%) rename modelscope/models/nlp/palm_v2/{configuration_palm.py => configuration.py} (100%) rename modelscope/models/nlp/palm_v2/{palm_for_text_generation.py => text_generation.py} (100%) rename modelscope/models/nlp/plug/{modeling_plug.py => backbone.py} (99%) rename modelscope/models/nlp/plug/{configuration_plug.py => configuration.py} (100%) rename modelscope/models/nlp/ponet/{modeling_ponet.py => backbone.py} (55%) rename modelscope/models/nlp/ponet/{configuration_ponet.py => configuration.py} (96%) create mode 100644 modelscope/models/nlp/ponet/fill_mask.py rename modelscope/models/nlp/ponet/{tokenization_ponet.py => tokenization.py} (98%) delete mode 100644 modelscope/models/nlp/ponet_for_masked_language.py delete mode 100644 modelscope/models/nlp/sentence_embedding.py delete mode 100644 modelscope/models/nlp/sequence_classification.py rename modelscope/models/nlp/space/{model/configuration_space.py => configuration.py} (100%) rename modelscope/models/nlp/space/{space_for_dialog_intent_prediction.py => dialog_intent_prediction.py} (66%) rename modelscope/models/nlp/space/{space_for_dialog_modeling.py => dialog_modeling.py} (73%) rename modelscope/models/nlp/space/{model/modeling_space.py => dialog_state_tracking.py} (57%) delete mode 100644 modelscope/models/nlp/space/space_for_dialog_state_tracking.py rename modelscope/models/nlp/space_T_cn/{modeling_space_T_cn.py => backbone.py} (99%) rename modelscope/models/nlp/space_T_cn/{configuration_space_T_cn.py => configuration.py} (100%) rename modelscope/models/nlp/{ => space_T_cn}/table_question_answering.py (94%) create mode 100644 modelscope/models/nlp/space_T_en/__init__.py rename modelscope/models/nlp/{star_text_to_sql.py => space_T_en/text_to_sql.py} (59%) create mode 100755 modelscope/models/nlp/structbert/backbone.py rename modelscope/models/nlp/structbert/{configuration_sbert.py => configuration.py} (94%) rename modelscope/models/nlp/{sbert_for_faq_question_answering.py => structbert/faq_question_answering.py} (74%) create mode 100644 modelscope/models/nlp/structbert/fill_mask.py delete mode 100755 modelscope/models/nlp/structbert/modeling_sbert.py create mode 100644 modelscope/models/nlp/structbert/text_classification.py create mode 100644 modelscope/models/nlp/structbert/token_classification.py rename modelscope/models/nlp/structbert/{tokenization_sbert.py => tokenization.py} (100%) rename modelscope/models/nlp/structbert/{tokenization_sbert_fast.py => tokenization_fast.py} (99%) rename modelscope/models/nlp/{ => task_models}/nncrf_for_named_entity_recognition.py (83%) delete mode 100644 modelscope/models/nlp/text_ranking.py delete mode 100644 modelscope/models/nlp/token_classification.py create mode 100644 modelscope/models/nlp/veco/backbone.py rename modelscope/models/nlp/veco/{configuration_veco.py => configuration.py} (100%) create mode 100644 modelscope/models/nlp/veco/fill_mask.py delete mode 100644 modelscope/models/nlp/veco/modeling_veco.py create mode 100644 modelscope/models/nlp/veco/text_classification.py create mode 100644 modelscope/models/nlp/veco/token_classification.py rename modelscope/models/nlp/veco/{tokenization_veco.py => tokenization.py} (100%) rename modelscope/models/nlp/veco/{tokenization_veco_fast.py => tokenization_fast.py} (99%) create mode 100644 modelscope/outputs/__init__.py rename modelscope/{preprocessors/space_T_cn/fields => outputs/nlp}/__init__.py (100%) create mode 100644 modelscope/outputs/nlp/model_outputs.py rename 
modelscope/{ => outputs}/outputs.py (93%) delete mode 100644 modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py delete mode 100644 modelscope/pipelines/nlp/sequence_classification_pipeline.py create mode 100644 modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py create mode 100644 modelscope/preprocessors/nlp/document_segmentation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/fill_mask_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/relation_extraction_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_classification_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/sentence_piece_preprocessor.py rename modelscope/preprocessors/{ => nlp}/space/__init__.py (100%) rename modelscope/preprocessors/{ => nlp}/space/args.py (97%) rename modelscope/preprocessors/{ => nlp}/space/batch.py (96%) rename modelscope/preprocessors/{ => nlp}/space/data_loader.py (87%) rename modelscope/preprocessors/{ => nlp}/space/dialog_intent_prediction_preprocessor.py (64%) rename modelscope/preprocessors/{ => nlp}/space/dialog_modeling_preprocessor.py (75%) rename modelscope/preprocessors/{ => nlp}/space/dialog_state_tracking_preprocessor.py (92%) rename modelscope/preprocessors/{ => nlp}/space/dst_processors.py (100%) create mode 100644 modelscope/preprocessors/nlp/space/fields/__init__.py rename modelscope/preprocessors/{ => nlp}/space/fields/gen_field.py (99%) rename modelscope/preprocessors/{ => nlp}/space/fields/intent_field.py (99%) rename modelscope/preprocessors/{ => nlp}/space/lazy_dataset.py (93%) rename modelscope/preprocessors/{ => nlp}/space/preprocess.py (92%) rename modelscope/preprocessors/{ => nlp}/space/sampler.py (96%) rename modelscope/preprocessors/{ => nlp}/space/tensorlistdataset.py (100%) rename modelscope/preprocessors/{ => nlp}/space/tokenizer.py (99%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/__init__.py (100%) create mode 100644 modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/database.py (98%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/schema_link.py (99%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/fields/struct.py (100%) rename modelscope/preprocessors/{ => nlp}/space_T_cn/table_question_answering_preprocessor.py (96%) rename modelscope/preprocessors/{star => nlp/space_T_en}/__init__.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/conversational_text_to_sql_preprocessor.py (84%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/__init__.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/common_utils.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/parse.py (100%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/preprocess_dataset.py (95%) rename modelscope/preprocessors/{star => nlp/space_T_en}/fields/process_dataset.py (94%) create mode 100644 modelscope/preprocessors/nlp/text2text_generation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_generation_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/text_ranking_preprocessor.py create mode 100644 modelscope/preprocessors/nlp/token_classification_preprocessor.py create mode 
100644 modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py delete mode 100644 modelscope/preprocessors/space/fields/__init__.py delete mode 100644 modelscope/preprocessors/space/fields/dst_processors.py create mode 100644 modelscope/utils/nlp/space_T_en/__init__.py rename modelscope/utils/nlp/{nlp_utils.py => space_T_en/utils.py} (52%) create mode 100644 tests/hub/test_download_dataset.py create mode 100644 tests/models/test_deberta_v2_backbone.py create mode 100644 tests/outputs/__init__.py create mode 100644 tests/outputs/test_model_outputs.py create mode 100644 tests/pipelines/nlp/test_faq.py create mode 100644 tests/pipelines/test_text_classification.py diff --git a/data/test/regression/fill_mask_sbert_zh.bin b/data/test/regression/fill_mask_sbert_zh.bin index 812f7ba2..62581a26 100644 --- a/data/test/regression/fill_mask_sbert_zh.bin +++ b/data/test/regression/fill_mask_sbert_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280 -size 119940 +oid sha256:4eae921001139d7e3c06331c9ef2213f8fc1c23512acd95751559866fb770e96 +size 121855 diff --git a/data/test/regression/fill_mask_veco_en.bin b/data/test/regression/fill_mask_veco_en.bin index be3fddc8..4d2dba7d 100644 --- a/data/test/regression/fill_mask_veco_en.bin +++ b/data/test/regression/fill_mask_veco_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705 -size 119619 +oid sha256:f97d34d7450d17d0a93647129ab10d16b1f6e70c34a73b6f7687b79519ee4f71 +size 121563 diff --git a/data/test/regression/fill_mask_veco_zh.bin b/data/test/regression/fill_mask_veco_zh.bin index c0d27e20..a6eb5621 100644 --- a/data/test/regression/fill_mask_veco_zh.bin +++ b/data/test/regression/fill_mask_veco_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a -size 119619 +oid sha256:a8355f27a3235209f206b5e75f4400353e5989e94cf4d71270b42ded8821d536 +size 121563 diff --git a/data/test/regression/sbert-base-tnews.bin b/data/test/regression/sbert-base-tnews.bin index 1546860f..d2c63ab0 100644 --- a/data/test/regression/sbert-base-tnews.bin +++ b/data/test/regression/sbert-base-tnews.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bce1341f4b55d536771dad6e2b280458579f46c3216474ceb8a926022ab53d0 -size 151572 +oid sha256:344ef971bdf310b76c6571d1f4994ab6abc5edc659654d71a4f75b14a30960c2 +size 152926 diff --git a/data/test/regression/sbert_nli.bin b/data/test/regression/sbert_nli.bin index 68efb778..52e31692 100644 --- a/data/test/regression/sbert_nli.bin +++ b/data/test/regression/sbert_nli.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6af5024a26337a440c7ea2935fce84af558dd982ee97a2f027bb922cc874292b -size 61741 +oid sha256:f0aeb07b6c9b40a0cfa7492e839431764e9bece93c906833a07c05e83520a399 +size 63161 diff --git a/data/test/regression/sbert_sen_sim.bin b/data/test/regression/sbert_sen_sim.bin index 362f762c..1c8efb81 100644 --- a/data/test/regression/sbert_sen_sim.bin +++ b/data/test/regression/sbert_sen_sim.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbce084781342ca7274c2e4d02ed5c5de43ba213a3b76328d5994404d6544c41 -size 61745 +oid sha256:7aa5c7a2565ccf0d2eea4baf8adbd0e020dbe36a7159b31156c53141cc9b2df2 +size 63165 diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin index 
6e441f7f..3ad45356 100644 --- a/data/test/regression/sbert_ws_en.bin +++ b/data/test/regression/sbert_ws_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c -size 61239 +oid sha256:cc6de82a8485fbfa008f6c2d5411cd07ba03e4a780bcb4e67efc6fba3c6ce92f +size 63597 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index b1841351..a85d787f 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 -size 61115 +oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030 +size 63349 diff --git a/data/test/regression/sbert_zero_shot.bin b/data/test/regression/sbert_zero_shot.bin index 23d40946..04171523 100644 --- a/data/test/regression/sbert_zero_shot.bin +++ b/data/test/regression/sbert_zero_shot.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85 -size 61589 +oid sha256:01f9b9bf6f8bbf9bb377d4cb6f399b2e5e065381f5b7332343e0db7b4fae72a5 +size 62519 diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py index f19d2bbb..c8b7900e 100644 --- a/modelscope/exporters/base.py +++ b/modelscope/exporters/base.py @@ -19,10 +19,13 @@ class Exporter(ABC): def from_model(cls, model: Model, **kwargs): """Build the Exporter instance. - @param model: A model instance. it will be used to output the generated file, + Args: + model: A Model instance. it will be used to generate the intermediate format file, and the configuration.json in its model_dir field will be used to create the exporter instance. - @param kwargs: Extra kwargs used to create the Exporter instance. - @return: The Exporter instance + kwargs: Extra kwargs used to create the Exporter instance. + + Returns: + The Exporter instance """ cfg = Config.from_file( os.path.join(model.model_dir, ModelFile.CONFIGURATION)) @@ -44,10 +47,13 @@ class Exporter(ABC): In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param opset: The version of the ONNX operator set to use. - @param outputs: The output dir. - @param kwargs: In this default implementation, - kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). - @return: A dict contains the model name with the model file path. + Args: + opset: The version of the ONNX operator set to use. + outputs: The output dir. + kwargs: In this default implementation, + kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). + + Returns: + A dict contains the model name with the model file path. """ pass diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index 52dab4bc..7cee331b 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -27,11 +27,14 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. - @param shape: A tuple of input shape which should have at most two dimensions. 
- shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. - shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. - @param pair: Generate sentence pairs or single sentences for dummy inputs. - @return: Dummy inputs. + Args: + shape: A tuple of input shape which should have at most two dimensions. + shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor. + shape = (8, 128) batch_size=1, sequence_length=128, which will cover the config of the preprocessor. + pair(bool, `optional`): Whether to generate sentence pairs or single sentences. + + Returns: + Dummy inputs. """ cfg = Config.from_file( diff --git a/modelscope/exporters/torch_model_exporter.py b/modelscope/exporters/torch_model_exporter.py index 98a23fe5..94ef277a 100644 --- a/modelscope/exporters/torch_model_exporter.py +++ b/modelscope/exporters/torch_model_exporter.py @@ -13,8 +13,8 @@ from modelscope.models import TorchModel from modelscope.pipelines.base import collate_fn from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from modelscope.utils.regress_test_utils import compare_arguments_nested -from modelscope.utils.tensor_utils import torch_nested_numpify +from modelscope.utils.regress_test_utils import (compare_arguments_nested, + numpify_tensor_nested) from .base import Exporter logger = get_logger(__name__) @@ -28,49 +28,61 @@ class TorchModelExporter(Exporter): and to provide implementations for generate_dummy_inputs/inputs/outputs methods. """ - def export_onnx(self, outputs: str, opset=11, **kwargs): + def export_onnx(self, output_dir: str, opset=13, **kwargs): """Export the model as onnx format files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param opset: The version of the ONNX operator set to use. - @param outputs: The output dir. - @param kwargs: In this default implementation, - you can pass the arguments needed by _torch_export_onnx, other unrecognized args - will be carried to generate_dummy_inputs as extra arguments (such as input shape). - @return: A dict containing the model key - model file path pairs. + Args: + opset: The version of the ONNX operator set to use. + output_dir: The output dir. + kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, + you can pass the arguments needed by _torch_export_onnx, other unrecognized args + will be carried to generate_dummy_inputs as extra arguments (such as input shape). + + Returns: + A dict containing the model key - model file path pairs. """ - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE) + onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE) self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs) return {'model': onnx_file} - def export_torch_script(self, outputs: str, **kwargs): + def export_torch_script(self, output_dir: str, **kwargs): """Export the model as torch script files. In some cases, several files may be generated, So please return a dict which contains the generated name with the file path. - @param outputs: The output dir. - @param kwargs: In this default implementation, + Args: + output_dir: The output dir. 
+ kwargs: + model: A model instance which will replace the exporting of self.model. + In this default implementation, you can pass the arguments needed by _torch_export_torch_script, other unrecognized args will be carried to generate_dummy_inputs as extra arguments (like input shape). - @return: A dict contains the model name with the model file path. + + Returns: + A dict contains the model name with the model file path. """ - model = self.model + model = self.model if 'model' not in kwargs else kwargs.pop('model') if not isinstance(model, nn.Module) and hasattr(model, 'model'): model = model.model - ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE) + ts_file = os.path.join(output_dir, ModelFile.TS_MODEL_FILE) # generate ts by tracing self._torch_export_torch_script(model, ts_file, **kwargs) return {'model': ts_file} def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: """Generate dummy inputs for model exportation to onnx or other formats by tracing. - @return: Dummy inputs. + + Returns: + Dummy inputs. """ return None @@ -93,7 +105,7 @@ class TorchModelExporter(Exporter): def _torch_export_onnx(self, model: nn.Module, output: str, - opset: int = 11, + opset: int = 13, device: str = 'cpu', validation: bool = True, rtol: float = None, @@ -101,18 +113,27 @@ class TorchModelExporter(Exporter): **kwargs): """Export the model to an onnx format file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param opset: The version of the ONNX operator set to use. - @param device: The device used to forward. - @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + opset: The version of the ONNX operator set to use. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). + inputs: An inputs structure which will replace the calling of self.inputs. + outputs: An outputs structure which will replace the calling of self.outputs. 
""" - dummy_inputs = self.generate_dummy_inputs(**kwargs) - inputs = self.inputs - outputs = self.outputs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if 'dummy_inputs' not in kwargs else kwargs.pop( + 'dummy_inputs') + inputs = self.inputs if 'inputs' not in kwargs else kwargs.pop( + 'inputs') + outputs = self.outputs if 'outputs' not in kwargs else kwargs.pop( + 'outputs') if dummy_inputs is None or inputs is None or outputs is None: raise NotImplementedError( 'Model property dummy_inputs,inputs,outputs must be set.') @@ -125,7 +146,7 @@ class TorchModelExporter(Exporter): if isinstance(dummy_inputs, Mapping): dummy_inputs = dict(dummy_inputs) - onnx_outputs = list(self.outputs.keys()) + onnx_outputs = list(outputs.keys()) with replace_call(): onnx_export( @@ -160,11 +181,13 @@ class TorchModelExporter(Exporter): outputs_origin = model.forward( *_decide_input_format(model, dummy_inputs)) if isinstance(outputs_origin, Mapping): - outputs_origin = torch_nested_numpify( + outputs_origin = numpify_tensor_nested( list(outputs_origin.values())) + elif isinstance(outputs_origin, (tuple, list)): + outputs_origin = numpify_tensor_nested(outputs_origin) outputs = ort_session.run( onnx_outputs, - torch_nested_numpify(dummy_inputs), + numpify_tensor_nested(dummy_inputs), ) tols = {} @@ -184,19 +207,26 @@ class TorchModelExporter(Exporter): validation: bool = True, rtol: float = None, atol: float = None, + strict: bool = True, **kwargs): """Export the model to a torch script file. - @param model: A torch.nn.Module instance to export. - @param output: The output file. - @param device: The device used to forward. - @param validation: Whether validate the export file. - @param rtol: The rtol used to regress the outputs. - @param atol: The atol used to regress the outputs. + Args: + model: A torch.nn.Module instance to export. + output: The output file. + device: The device used to forward. + validation: Whether validate the export file. + rtol: The rtol used to regress the outputs. + atol: The atol used to regress the outputs. + strict: strict mode in torch script tracing. + kwargs: + dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs(). """ model.eval() - dummy_inputs = self.generate_dummy_inputs(**kwargs) + dummy_param = 'dummy_inputs' not in kwargs + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if dummy_param else kwargs.pop('dummy_inputs') if dummy_inputs is None: raise NotImplementedError( 'Model property dummy_inputs must be set.') @@ -207,7 +237,7 @@ class TorchModelExporter(Exporter): model.eval() with replace_call(): traced_model = torch.jit.trace( - model, dummy_inputs, strict=False) + model, dummy_inputs, strict=strict) torch.jit.save(traced_model, output) if validation: @@ -216,9 +246,9 @@ class TorchModelExporter(Exporter): model.eval() ts_model.eval() outputs = ts_model.forward(*dummy_inputs) - outputs = torch_nested_numpify(outputs) + outputs = numpify_tensor_nested(outputs) outputs_origin = model.forward(*dummy_inputs) - outputs_origin = torch_nested_numpify(outputs_origin) + outputs_origin = numpify_tensor_nested(outputs_origin) tols = {} if rtol is not None: tols['rtol'] = rtol @@ -240,7 +270,6 @@ def replace_call(): problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it back after the tracing was done. 
""" - TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl yield TorchModel.__call__ = TorchModel.call_origin diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 913589d8..01b08699 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -69,7 +69,6 @@ class Models(object): space_modeling = 'space-modeling' space_T_en = 'space-T-en' space_T_cn = 'space-T-cn' - tcrf = 'transformer-crf' transformer_softmax = 'transformer-softmax' lcrf = 'lstm-crf' diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 3a9d810f..1b9db825 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,9 +10,6 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. """ - def __init__(self, trainer=None, *args, **kwargs): - self.trainer = trainer - @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. diff --git a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 05b72170..f8595fc1 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -34,17 +34,24 @@ class TokenClassificationMetric(Metric): self.labels.append( torch_nested_numpify(torch_nested_detach(ground_truths))) - def __init__(self, return_entity_level_metrics=False, *args, **kwargs): + def __init__(self, + return_entity_level_metrics=False, + label2id=None, + *args, + **kwargs): super().__init__(*args, **kwargs) self.return_entity_level_metrics = return_entity_level_metrics self.preds = [] self.labels = [] + self.label2id = label2id def evaluate(self): - self.id2label = { - id: label - for label, id in self.trainer.label2id.items() - } + label2id = self.label2id + if label2id is None: + assert hasattr(self, 'trainer') + label2id = self.trainer.label2id + + self.id2label = {id: label for label, id in label2id.items()} self.preds = np.concatenate(self.preds, axis=0) self.labels = np.concatenate(self.labels, axis=0) predictions = np.argmax(self.preds, axis=-1) diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index cdc71fcf..1246551e 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -5,11 +5,11 @@ from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download -from modelscope.models.builder import build_model -from modelscope.utils.checkpoint import save_pretrained +from modelscope.models.builder import MODELS, build_model +from modelscope.utils.checkpoint import save_checkpoint, save_pretrained from modelscope.utils.config import Config -from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile -from modelscope.utils.device import device_placement, verify_device +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks +from modelscope.utils.device import verify_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -66,7 +66,6 @@ class Model(ABC): revision: Optional[str] = DEFAULT_MODEL_REVISION, cfg_dict: Config = None, device: str = None, - *model_args, **kwargs): """ Instantiate a model from local directory or remote model repo. Note that when loading from remote, the model revision can be specified. 
@@ -90,11 +89,11 @@ class Model(ABC): cfg = Config.from_file( osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task + if 'task' in kwargs: + task_name = kwargs.pop('task') model_cfg = cfg.model - if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type - model_cfg.model_dir = local_model_dir for k, v in kwargs.items(): model_cfg[k] = v @@ -109,15 +108,19 @@ class Model(ABC): # dynamically add pipeline info to model for pipeline inference if hasattr(cfg, 'pipeline'): model.pipeline = cfg.pipeline + + if not hasattr(model, 'cfg'): + model.cfg = cfg return model def save_pretrained(self, target_folder: Union[str, os.PathLike], save_checkpoint_names: Union[str, List[str]] = None, - save_function: Callable = None, + save_function: Callable = save_checkpoint, config: Optional[dict] = None, **kwargs): - """save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded + """save the pretrained model, its configuration and other related files to a directory, + so that it can be re-loaded Args: target_folder (Union[str, os.PathLike]): @@ -133,5 +136,10 @@ class Model(ABC): The config for the configuration.json, might not be identical with model.config """ + if config is None and hasattr(self, 'cfg'): + config = self.cfg + assert config is not None, 'Cannot save the model because the model config is empty.' + if isinstance(config, Config): + config = config.to_dict() save_pretrained(self, target_folder, save_checkpoint_names, save_function, config, **kwargs) diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index 7a8e28f4..a35358c1 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from modelscope.utils.config import ConfigDict +from modelscope.utils.constant import Tasks from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg MODELS = Registry('models') BACKBONES = Registry('backbones') +BACKBONES._modules = MODELS._modules HEADS = Registry('heads') @@ -23,30 +25,27 @@ def build_model(cfg: ConfigDict, cfg, MODELS, group_key=task_name, default_args=default_args) -def build_backbone(cfg: ConfigDict, - field: str = None, - default_args: dict = None): +def build_backbone(cfg: ConfigDict, default_args: dict = None): """ build backbone given backbone config dict Args: cfg (:obj:`ConfigDict`): config dict for backbone object. - field (str, optional): field, such as CV, NLP's backbone default_args (dict, optional): Default initialization arguments. """ return build_from_cfg( - cfg, BACKBONES, group_key=field, default_args=default_args) + cfg, BACKBONES, group_key=Tasks.backbone, default_args=default_args) def build_head(cfg: ConfigDict, - group_key: str = None, + task_name: str = None, default_args: dict = None): """ build head given config dict Args: cfg (:obj:`ConfigDict`): config dict for head object. + task_name (str, optional): task name, refer to + :obj:`Tasks` for more details default_args (dict, optional): Default initialization arguments. 
""" - if group_key is None: - group_key = cfg[TYPE_NAME] return build_from_cfg( - cfg, HEADS, group_key=group_key, default_args=default_args) + cfg, HEADS, group_key=task_name, default_args=default_args) diff --git a/modelscope/models/nlp/T5/__init__.py b/modelscope/models/nlp/T5/__init__.py index 7c1cea36..cb0921c6 100644 --- a/modelscope/models/nlp/T5/__init__.py +++ b/modelscope/models/nlp/T5/__init__.py @@ -1,13 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .t5_for_text_generation import T5ForConditionalGeneration + from .backbone import T5Model + from .text2text_generation import T5ForConditionalGeneration else: _import_structure = { - 't5_for_text_generation': ['T5ForConditionalGeneration'], + 'backbone': ['T5Model'], + 'text2text_generation': ['T5ForConditionalGeneration'], } import sys diff --git a/modelscope/models/nlp/T5/modeling_t5.py b/modelscope/models/nlp/T5/backbone.py similarity index 73% rename from modelscope/models/nlp/T5/modeling_t5.py rename to modelscope/models/nlp/T5/backbone.py index da50741e..9a46d980 100644 --- a/modelscope/models/nlp/T5/modeling_t5.py +++ b/modelscope/models/nlp/T5/backbone.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,12 +22,8 @@ from typing import Optional, Tuple, Union import torch from torch import nn -from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, Seq2SeqModelOutput) from transformers.modeling_utils import (PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer) @@ -36,30 +33,20 @@ from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, from transformers.utils.model_parallel_utils import (assert_device_map, get_device_map) +from modelscope.metainfo import Models +from modelscope.models.base import Model, Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqModelOutput) +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_t5 import T5Config +from .configuration import T5Config logger = get_logger(__name__) -_CONFIG_FOR_DOC = 'T5Config' -_TOKENIZER_FOR_DOC = 'T5Tokenizer' -_CHECKPOINT_FOR_DOC = 't5-small' -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### -T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ - 't5-small', - 't5-base', - 't5-large', - 't5-3b', - 't5-11b', - # See all T5 models at https://huggingface.co/models?filter=t5 -] - - -#################################################### +################################################### # This is a conversion method from TF 1.0 to PyTorch # More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 #################################################### @@ -173,65 +160,6 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): return model -#################################################### -# 
PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of nn.Module) -#################################################### -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - t5-small: 6 - - t5-base: 12 - - t5-large: 24 - - t5-3b: 24 - - t5-11b: 24 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs - # using t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. - - Example: - - ```python - # On a 4 GPU machine with t5-3b: - model = T5ForConditionalGeneration.from_pretrained("t5-3b") - device_map = { - 0: [0, 1, 2], - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): @@ -261,23 +189,6 @@ class T5LayerNorm(nn.Module): return self.weight * hidden_states -try: - from apex.normalization import FusedRMSNorm - - T5LayerNorm = FusedRMSNorm # noqa - - logger.info( - 'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm' - ) -except ImportError: - # using the normal T5LayerNorm - pass -except Exception: - logger.warning( - 'discovered apex but it failed to load, falling back to T5LayerNorm') - pass - - class T5DenseReluDense(nn.Module): def __init__(self, config: T5Config): @@ -791,7 +702,7 @@ class T5Block(nn.Module): return outputs -class T5PreTrainedModel(PreTrainedModel): +class T5PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
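The apex `FusedRMSNorm` fallback removed above was a drop-in replacement only because `T5LayerNorm` is a pure RMS norm: scale-only, with no mean subtraction and no bias. A minimal standalone sketch of that computation, written from the standard T5 formulation rather than copied from this file:

```python
import torch
from torch import nn


class SimpleRMSNorm(nn.Module):
    """RMS layer norm in the T5 style: scale only, no mean subtraction, no bias."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Root-mean-square over the last dimension; unlike standard LayerNorm
        # there is no mean subtraction and no bias term.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)


norm = SimpleRMSNorm(hidden_size=8)
print(norm(torch.randn(2, 3, 8)).shape)  # torch.Size([2, 3, 8])
```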
@@ -803,6 +714,10 @@ class T5PreTrainedModel(PreTrainedModel): is_parallelizable = True supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + @property def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) @@ -819,8 +734,7 @@ class T5PreTrainedModel(PreTrainedModel): factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor * 1.0) - elif isinstance(module, - (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + elif isinstance(module, T5Model): # Mesh TensorFlow embeddings initialization See # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) @@ -902,6 +816,36 @@ class T5PreTrainedModel(PreTrainedModel): return shifted_input_ids + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the + label information. num_labels: An optional arg to tell the + model how many classes to initialize. + Method will call utils.parse_label_mapping + if num_labels not supplied. If num_labels is + not found, the model will use the default + setting (2 classes). + + Returns: + The loaded model, which is initialized by + transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = T5Config(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + class T5Stack(T5PreTrainedModel): @@ -926,8 +870,42 @@ class T5Stack(T5PreTrainedModel): self.device_map = None self.gradient_checkpointing = False - @add_start_docstrings(PARALLELIZE_DOCSTRING) def parallelize(self, device_map=None): + r""" + This is an experimental feature and is a subject to change at a + moment's notice. + + Uses a device map to distribute attention modules of the model + across several devices. If no device map is given, it will evenly + distribute blocks across all devices. + + Args: + device_map (`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note + that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric + reasons). That means that the first device should have fewer + attention modules mapped to it than other devices. 
For + reference, the t5 models have the following number of + attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example: + + ```python # Here is an example of a device map on a machine with 4 + GPUs # using t5-3b, which has a total of 24 attention modules: model + = T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) ``` all of the parallelize methods + in this file are the same + + """ # Check validity of device_map self.device_map = ( get_device_map(len(self.block), range(torch.cuda.device_count())) @@ -948,8 +926,22 @@ class T5Stack(T5PreTrainedModel): # Set final layer norm to last device self.final_layer_norm = self.final_layer_norm.to(self.last_device) - @add_start_docstrings(PARALLELIZE_DOCSTRING) def deparallelize(self): + r""" + Moves the model to cpu from a model parallel state. + + Example: + + ```python # On a 4 GPU machine with t5-3b: model = + T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = { + 0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14, + 15, 16], 3: [17, 18, 19, 20, 21, 22, 23], + } model.parallelize(device_map) # Splits the model across several + devices model.deparallelize() # Put the model back on cpu and + cleans memory by calling torch.cuda.empty_cache() ``` + + all of the deparallelize methods in this file are the same + """ self.model_parallel = False self.device_map = None self.first_device = 'cpu' @@ -1199,7 +1191,20 @@ class T5Stack(T5PreTrainedModel): ) -T5_START_DOCSTRING = r""" +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.T5) +class T5Model(T5PreTrainedModel): + """The bare T5 Model transformer outputting raw hidden-states without any + specific head on top. The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by @@ -1224,10 +1229,99 @@ T5_START_DOCSTRING = r""" with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" + """ + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) -T5_INPUTS_DOCSTRING = r""" - Args: + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of + heads to prune in this layer} See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you should be able to pad the @@ -1343,244 +1437,84 @@ T5_INPUTS_DOCSTRING = r""" return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" + Returns: -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model - with relative position embeddings so you should be able to pad the - inputs on both the right and the left. + Example: - Indices can be obtained using [`T5Tokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for detail. + ```python >>> from transformers import T5Tokenizer, T5Model - To know more on how to prepare `input_ids` for pretraining take a - look a [T5 Training](./t5#training). - attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: + >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") + >>> model = T5Model.from_pretrained("t5-small") - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + >>> input_ids = tokenizer( + ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" + >>> ).input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: + >>> # forward pass + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. + # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask + if head_mask is not None and decoder_head_mask is None: + if self.config.num_layers == self.config.num_decoder_layers: + warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) + decoder_head_mask = head_mask - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain - tuple. 
-""" + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] + if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 else None, + ) -# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask -__HEAD_MASK_WARNING_MSG = """ -The input argument `head_mask` was split into two arguments `head_mask` and -`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, -but this feature is deprecated and will be removed in future versions. If you do -not want to use any `decoder_head_mask` now, please set `decoder_head_mask = -torch.ones(num_layers, num_heads)`. -""" + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to( + self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to( + self.decoder.first_device) - -@add_start_docstrings( - 'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.', - T5_START_DOCSTRING, -) -class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - r'decoder\.embed_tokens\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = 
self.decoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - decoder_inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: - r""" - Returns: - - Example: - - ```python >>> from transformers import T5Tokenizer, T5Model - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5Model.from_pretrained("t5-small") - - >>> input_ids = tokenizer( - ... 
"Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - - >>> # forward pass - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) if not return_dict: return decoder_outputs + encoder_outputs @@ -1595,409 +1529,3 @@ class T5Model(T5PreTrainedModel): encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings("""T5 Model with a `language modeling` head on top.""", - T5_START_DOCSTRING) -class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r'encoder\.embed_tokens\.weight', - 
r'decoder\.embed_tokens\.weight', - r'lm_head\.weight', - ] - _keys_to_ignore_on_load_unexpected = [ - r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.model_dim = config.d_model - - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.is_decoder = False - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.decoder.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.decoder = self.decoder.to('cpu') - self.lm_head = self.lm_head.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. 
- Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All - labels set to `-100` are ignored (masked), the loss is only computed - for labels in `[0, ..., config.vocab_size]` - - Returns: - - Examples: - - ```python >>> from transformers import T5Tokenizer, - T5ForConditionalGeneration - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5ForConditionalGeneration.from_pretrained("t5-small") - - >>> # training - >>> input_ids = tokenizer("The walks in park", return_tensors="pt").input_ids - >>> labels = tokenizer(" cute dog the ", return_tensors="pt").input_ids - >>> outputs = model(input_ids=input_ids, labels=labels) - >>> loss = outputs.loss - >>> logits = outputs.logits - - >>> # inference - >>> input_ids = tokenizer( - ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model.generate(input_ids) - >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) - >>> # studies have shown that owning a dog is good for you. - ```""" - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask - if head_mask is not None and decoder_head_mask is None: - if self.config.num_layers == self.config.num_decoder_layers: - warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning) - decoder_head_mask = head_mask - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - # Convert encoder inputs in embeddings if needed - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] - if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] - if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - - if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: - # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(labels) - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to( - self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to( - self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=decoder_head_mask, - cross_attn_head_mask=cross_attn_head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - 
sequence_output = decoder_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.encoder.first_device) - self.lm_head = self.lm_head.to(self.encoder.first_device) - sequence_output = sequence_output.to(self.lm_head.weight.device) - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab See - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim**-0.5) - - lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct( - lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss - # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs - return ((loss, ) + output) if loss is not None else output - - return Seq2SeqLMOutput( - loss=loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'decoder_input_ids': input_ids, - 'past_key_values': past, - 'encoder_outputs': encoder_outputs, - 'attention_mask': attention_mask, - 'head_mask': head_mask, - 'decoder_head_mask': decoder_head_mask, - 'cross_attn_head_mask': cross_attn_head_mask, - 'use_cache': use_cache, - } - - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): - return self._shift_right(labels) - - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning( - 'You might want to consider setting `use_cache=True` to speed up decoding' - ) - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select( - 0, beam_idx.to(layer_past_state.device)), ) - - assert reordered_layer_past_states[0].shape == layer_past_states[ - 0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + ( - reordered_layer_past_states, ) - return reordered_decoder_past - - -@add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.", - T5_START_DOCSTRING, -) -class T5EncoderModel(T5PreTrainedModel): - authorized_missing_keys = [ - r'encoder\.embed_tokens\.weight', - ] - - def __init__(self, config: 
T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - # Initialize weights and apply final processing - self.post_init() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map( - len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None else device_map) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.encoder = self.encoder.to('cpu') - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of - heads to prune in this layer} See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]: - r""" - Returns: - - Example: - - ```python - >>> from transformers import T5Tokenizer, T5EncoderModel - - >>> tokenizer = T5Tokenizer.from_pretrained("t5-small") - >>> model = T5EncoderModel.from_pretrained("t5-small") - >>> input_ids = tokenizer( - ... "Studies have been shown that owning a dog is good for you", return_tensors="pt" - >>> ).input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids) - >>> last_hidden_states = outputs.last_hidden_state - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - return encoder_outputs diff --git a/modelscope/models/nlp/T5/configuration_t5.py b/modelscope/models/nlp/T5/configuration.py similarity index 99% rename from modelscope/models/nlp/T5/configuration_t5.py rename to modelscope/models/nlp/T5/configuration.py index 117a6bc1..1f9a965e 100644 --- a/modelscope/models/nlp/T5/configuration_t5.py +++ b/modelscope/models/nlp/T5/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2020, The T5 Authors and HuggingFace Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/T5/t5_for_text_generation.py b/modelscope/models/nlp/T5/t5_for_text_generation.py deleted file mode 100644 index 27f077d8..00000000 --- a/modelscope/models/nlp/T5/t5_for_text_generation.py +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Optional, Tuple - -import torch - -from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel -from modelscope.models.builder import MODELS -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from .modeling_t5 import T5Config -from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration - - -@MODELS.register_module( - group_key=Tasks.text2text_generation, - module_name=Models.T5, -) -class T5ForConditionalGeneration(TorchModel): - - def __init__(self, model_dir=None, *args, **kwargs): - """initialize the text generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - model_cls (Optional[Any], optional): model loader, if None, use the - default loader to load model weights, by default None. - """ - super().__init__(model_dir, *args, **kwargs) - self.model = T5ForGeneration.from_pretrained(model_dir) - self.generate = self.model.generate - self.config = self.model.config - - def forward(self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_input_ids: Optional[torch.LongTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - decoder_head_mask: Optional[torch.FloatTensor] = None, - cross_attn_head_mask: Optional[torch.Tensor] = None, - encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - decoder_inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs): - return self.model.forward( - self, input_ids, attention_mask, decoder_input_ids, - decoder_attention_mask, head_mask, decoder_head_mask, - cross_attn_head_mask, encoder_outputs, past_key_values, - inputs_embeds, decoder_inputs_embeds, labels, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/T5/text2text_generation.py b/modelscope/models/nlp/T5/text2text_generation.py new file mode 100644 index 00000000..c4dcdfdb --- /dev/null +++ b/modelscope/models/nlp/T5/text2text_generation.py @@ -0,0 +1,455 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
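The new module below registers the refactored `T5ForConditionalGeneration` under `Tasks.text2text_generation`, so it is reachable through the ModelScope registry instead of the deleted `t5_for_text_generation.py` wrapper. Combined with the `task` kwarg added to `Model.from_pretrained` earlier in this patch, a checkpoint can be loaded roughly like this (the checkpoint path is a placeholder, not taken from this patch):

```python
from modelscope.models import Model
from modelscope.utils.constant import Tasks

# '/path/to/t5_checkpoint' stands in for a local model directory (or hub id)
# whose configuration.json points at Models.T5 for this task.
model = Model.from_pretrained(
    '/path/to/t5_checkpoint', task=Tasks.text2text_generation)
print(type(model).__name__)  # expected: T5ForConditionalGeneration
```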
+import copy +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.utils.model_parallel_utils import (assert_device_map, + get_device_map) + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import T5PreTrainedModel, T5Stack +from .configuration import T5Config + +logger = get_logger(__name__) + +# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask +__HEAD_MASK_WARNING_MSG = """ +The input argument `head_mask` was split into two arguments `head_mask` and +`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, +but this feature is deprecated and will be removed in future versions. If you do +not want to use any `decoder_head_mask` now, please set `decoder_head_mask = +torch.ones(num_layers, num_heads)`. +""" + + +@MODELS.register_module( + group_key=Tasks.text2text_generation, + module_name=Models.T5, +) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r'encoder\.embed_tokens\.weight', + r'decoder\.embed_tokens\.weight', + r'lm_head\.weight', + ] + _keys_to_ignore_on_load_unexpected = [ + r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight', + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map( + len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None else device_map) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to('cpu') + self.decoder = self.decoder.to('cpu') + self.lm_head = self.lm_head.to('cpu') + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + 
return self.decoder + + def forward(self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + decoder_head_mask: Optional[torch.FloatTensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model + with relative position embeddings so you should be able to pad the + inputs on both the right and the left. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for detail. + + [What are input IDs?](../glossary#input-ids) + + To know more on how to prepare `input_ids` for pretraining take a + look a [T5 Training](./t5#training). + attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`T5Tokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for + `decoder_input_ids` generation. If `past_key_values` is used, + optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining + take a look at [T5 Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, + target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in + `decoder_input_ids`. Causal mask will also be used by default. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules in the + decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or + `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in + the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, + `optional`: *attentions*) `last_hidden_state` of shape `(batch_size, + sequence_length, hidden_size)` is a sequence of hidden states at the + output of the last layer of the encoder. Used in the cross-attention + of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, + target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to + directly pass an embedded representation. If `past_key_values` is + used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more + control over how to convert `decoder_input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, + `decoder_inputs_embeds` takes the value of `inputs_embeds`. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain + tuple. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. + Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. 
All
+ labels set to `-100` are ignored (masked), the loss is only computed
+ for labels in `[0, ..., config.vocab_size]`
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+ >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+ >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+ >>> # training
+ >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+ >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
+ >>> outputs = model(input_ids=input_ids, labels=labels)
+ >>> loss = outputs.loss
+ >>> logits = outputs.logits
+
+ >>> # inference
+ >>> input_ids = tokenizer(
+ ... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+ ... ).input_ids # Batch size 1
+ >>> outputs = model.generate(input_ids)
+ >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ >>> # studies have shown that owning a dog is good for you.
+ ```"""
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+ if head_mask is not None and decoder_head_mask is None:
+ if self.config.num_layers == self.config.num_decoder_layers:
+ warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+ decoder_head_mask = head_mask
+
+ # Encode if needed (training, first prediction pass)
+ if encoder_outputs is None:
+ # Convert encoder inputs in embeddings if needed
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1]
+ if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2]
+ if len(encoder_outputs) > 2 else None,
+ )
+
+ hidden_states = encoder_outputs[0]
+
+ if self.model_parallel:
+ torch.cuda.set_device(self.decoder.first_device)
+
+ if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+ # get decoder inputs from shifting lm labels to the right
+ decoder_input_ids = self._shift_right(labels)
+
+ # Set device for model parallelism
+ if self.model_parallel:
+ torch.cuda.set_device(self.decoder.first_device)
+ hidden_states = hidden_states.to(self.decoder.first_device)
+ if decoder_input_ids is not None:
+ decoder_input_ids = decoder_input_ids.to(
+ self.decoder.first_device)
+ if attention_mask is not None:
+ attention_mask = attention_mask.to(self.decoder.first_device)
+ if decoder_attention_mask is not None:
+ decoder_attention_mask = decoder_attention_mask.to(
+ self.decoder.first_device)
+
+ # Decode
+ decoder_outputs = self.decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ inputs_embeds=decoder_inputs_embeds,
+ past_key_values=past_key_values,
+ encoder_hidden_states=hidden_states,
+ encoder_attention_mask=attention_mask,
+ head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = decoder_outputs[0]
+
+ # Set device for model
parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab See + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim**-0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct( + lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss + # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs + return ((loss, ) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'decoder_input_ids': input_ids, + 'past_key_values': past, + 'encoder_outputs': encoder_outputs, + 'attention_mask': attention_mask, + 'head_mask': head_mask, + 'decoder_head_mask': decoder_head_mask, + 'cross_attn_head_mask': cross_attn_head_mask, + 'use_cache': use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return self._shift_right(labels) + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning( + 'You might want to consider setting `use_cache=True` to speed up decoding' + ) + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select( + 0, beam_idx.to(layer_past_state.device)), ) + + assert reordered_layer_past_states[0].shape == layer_past_states[ + 0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + ( + reordered_layer_past_states, ) + return reordered_decoder_past diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 57222698..dff42d1c 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -4,80 +4,99 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .backbones import SbertModel - from 
.bart_for_text_error_correction import BartForTextErrorCorrection - from .bert_for_document_segmentation import BertForDocumentSegmentation - from .csanmt_for_translation import CsanmtForTranslation + from .bart import BartForTextErrorCorrection + from .csanmt import CsanmtForTranslation from .heads import SequenceClassificationHead from .gpt3 import GPT3ForTextGeneration - from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM, - BertForMaskedLM, DebertaV2ForMaskedLM) - from .ponet_for_masked_language import PoNetForMaskedLM - from .nncrf_for_named_entity_recognition import ( - TransformerCRFForNamedEntityRecognition, - LSTMCRFForNamedEntityRecognition) from .palm_v2 import PalmForTextGeneration - from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering - from .star_text_to_sql import StarForTextToSql - from .sequence_classification import (VecoForSequenceClassification, - SbertForSequenceClassification, - BertForSequenceClassification) - from .space import SpaceForDialogIntent - from .space import SpaceForDialogModeling - from .space import SpaceForDialogStateTracking - from .table_question_answering import TableQuestionAnswering - from .task_models import (FeatureExtractionModel, - InformationExtractionModel, - SequenceClassificationModel, - SingleBackboneTaskModelBase, - TokenClassificationModel, - TaskModelForTextGeneration) - from .token_classification import SbertForTokenClassification - from .sentence_embedding import SentenceEmbedding - from .text_ranking import TextRanking - from .T5 import T5ForConditionalGeneration + from .space_T_en import StarForTextToSql + from .space_T_cn import TableQuestionAnswering + from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST + from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig + from .structbert import ( + SbertForFaqQuestionAnswering, + SbertForMaskedLM, + SbertForSequenceClassification, + SbertForTokenClassification, + SbertTokenizer, + SbertTokenizerFast, + ) + from .bert import ( + BertForMaskedLM, + BertForTextRanking, + BertForSentenceEmbedding, + BertForSequenceClassification, + BertForTokenClassification, + BertForDocumentSegmentation, + BertModel, + BertConfig, + ) + from .veco import VecoModel, VecoConfig, VecoForTokenClassification, \ + VecoForSequenceClassification, VecoForMaskedLM, VecoTokenizer, VecoTokenizerFast + from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model + from .task_models import ( + FeatureExtractionModel, + InformationExtractionModel, + LSTMCRFForNamedEntityRecognition, + SequenceClassificationModel, + SingleBackboneTaskModelBase, + TaskModelForTextGeneration, + TokenClassificationModel, + TransformerCRFForNamedEntityRecognition, + ) + from .T5 import T5ForConditionalGeneration + from .gpt_neo import GPTNeoModel else: _import_structure = { 'backbones': ['SbertModel'], - 'bart_for_text_error_correction': ['BartForTextErrorCorrection'], - 'bert_for_document_segmentation': ['BertForDocumentSegmentation'], - 'csanmt_for_translation': ['CsanmtForTranslation'], + 'bart': ['BartForTextErrorCorrection'], + 'csanmt': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], 'gpt3': ['GPT3ForTextGeneration'], - 'masked_language': [ - 'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM', - 'DebertaV2ForMaskedLM' + 'structbert': [ + 'SbertForFaqQuestionAnswering', + 'SbertForMaskedLM', + 'SbertForSequenceClassification', + 'SbertForTokenClassification', + 'SbertTokenizer', + 'SbertTokenizerFast', ], - 
'nncrf_for_named_entity_recognition': [ - 'TransformerCRFForNamedEntityRecognition', - 'LSTMCRFForNamedEntityRecognition' - ], - 'ponet_for_masked_language': ['PoNetForMaskedLM'], - 'palm_v2': ['PalmForTextGeneration'], - 'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'], - 'star_text_to_sql': ['StarForTextToSql'], - 'sequence_classification': [ - 'VecoForSequenceClassification', 'SbertForSequenceClassification', - 'BertForSequenceClassification' + 'veco': [ + 'VecoModel', 'VecoConfig', 'VecoForTokenClassification', + 'VecoForSequenceClassification', 'VecoForMaskedLM', + 'VecoTokenizer', 'VecoTokenizerFast' ], - 'space': [ - 'SpaceForDialogIntent', 'SpaceForDialogModeling', - 'SpaceForDialogStateTracking' + 'bert': [ + 'BertForMaskedLM', + 'BertForTextRanking', + 'BertForSentenceEmbedding', + 'BertForSequenceClassification', + 'BertForTokenClassification', + 'BertForDocumentSegmentation', + 'BertModel', + 'BertConfig', ], + 'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'], + 'palm_v2': ['PalmForTextGeneration'], + 'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'], + 'space_T_en': ['StarForTextToSql'], + 'space_T_cn': ['TableQuestionAnswering'], + 'space': + ['SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDST'], 'task_models': [ 'FeatureExtractionModel', 'InformationExtractionModel', + 'LSTMCRFForNamedEntityRecognition', 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', - 'TokenClassificationModel', 'TaskModelForTextGeneration', + 'TokenClassificationModel', + 'TransformerCRFForNamedEntityRecognition', ], - 'token_classification': ['SbertForTokenClassification'], - 'table_question_answering': ['TableQuestionAnswering'], 'sentence_embedding': ['SentenceEmbedding'], - 'text_ranking': ['TextRanking'], 'T5': ['T5ForConditionalGeneration'], + 'gpt_neo': ['GPTNeoModel'], } import sys diff --git a/modelscope/models/nlp/backbones/bert.py b/modelscope/models/nlp/backbones/bert.py deleted file mode 100644 index aa513944..00000000 --- a/modelscope/models/nlp/backbones/bert.py +++ /dev/null @@ -1,7 +0,0 @@ -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.bert import BertModel -from modelscope.utils.constant import Fields - -BACKBONES.register_module( - group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel) diff --git a/modelscope/models/nlp/backbones/structbert.py b/modelscope/models/nlp/backbones/structbert.py deleted file mode 100644 index 74735520..00000000 --- a/modelscope/models/nlp/backbones/structbert.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import BACKBONES -from modelscope.models.nlp.structbert import SbertConfig -from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger - -logger = get_logger(__name__) - - -@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert) -class SbertModel(TorchModel, SbertModelTransform): - - def __init__(self, model_dir=None, add_pooling_layer=True, **config): - """ - Args: - model_dir (str, optional): The model checkpoint directory. Defaults to None. - add_pooling_layer (bool, optional): to decide if pool the output from hidden layer. Defaults to True. 
- """ - config = SbertConfig(**config) - super().__init__(model_dir) - self.config = config - SbertModelTransform.__init__(self, config, add_pooling_layer) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return SbertModelTransform.forward( - self, input_ids, attention_mask, token_type_ids, position_ids, - head_mask, inputs_embeds, encoder_hidden_states, - encoder_attention_mask, past_key_values, use_cache, - output_attentions, output_hidden_states, return_dict, **kwargs) diff --git a/modelscope/models/nlp/bart/__init__.py b/modelscope/models/nlp/bart/__init__.py new file mode 100644 index 00000000..31912efc --- /dev/null +++ b/modelscope/models/nlp/bart/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .text_error_correction import BartForTextErrorCorrection diff --git a/modelscope/models/nlp/bart_for_text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py similarity index 100% rename from modelscope/models/nlp/bart_for_text_error_correction.py rename to modelscope/models/nlp/bart/text_error_correction.py diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index cca79c2f..28a10f57 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -4,43 +4,33 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, + from .backbone import ( BertLayer, - BertLMHeadModel, BertModel, BertPreTrainedModel, - load_tf_weights_in_bert, ) - - from .configuration_bert import BertConfig, BertOnnxConfig - + from .configuration import BertConfig + from .fill_mask import BertForMaskedLM + from .text_ranking import BertForTextRanking + from .sentence_embedding import BertForSentenceEmbedding + from .text_classification import BertForSequenceClassification + from .token_classification import BertForTokenClassification + from .document_segmentation import BertForDocumentSegmentation else: _import_structure = { - 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], + 'backbone': [ + 'BertModel', + 'BertPreTrainedModel', + ], + 'configuration': ['BertConfig'], + 'fill_mask': ['BertForMaskedLM'], + 'text_ranking': ['BertForTextRanking'], + 'sentence_embedding': ['BertForSentenceEmbedding'], + 'text_classification': ['BertForSequenceClassification'], + 'token_classification': ['BertForTokenClassification'], + 'document_segmentation': ['BertForDocumentSegmentation'], } - _import_structure['modeling_bert'] = [ - 'BertForMaskedLM', - 'BertForMultipleChoice', - 'BertForNextSentencePrediction', - 'BertForPreTraining', - 'BertForQuestionAnswering', - 'BertForSequenceClassification', - 'BertForTokenClassification', - 'BertLayer', - 'BertLMHeadModel', - 'BertModel', - 'BertPreTrainedModel', - 'load_tf_weights_in_bert', - ] - import sys sys.modules[__name__] = 
LazyImportModule( diff --git a/modelscope/models/nlp/bert/backbone.py b/modelscope/models/nlp/bert/backbone.py new file mode 100755 index 00000000..df0aebd2 --- /dev/null +++ b/modelscope/models/nlp/bert/backbone.py @@ -0,0 +1,952 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions) +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import BertConfig + +logger = get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model + # variable name and be able to load any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and + # exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + 
past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs when its auto-generated, + # registered buffer helps users when tracing the model without passing + # token_type_ids, solves issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
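+ # The branches below cover four cases: cross-attention reusing a cached
+ # key/value pair, cross-attention computed from the encoder states,
+ # self-attention that appends new keys/values to the decoding cache, and
+ # plain self-attention with no cache.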
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all + # cross attention key/value_states. Further calls to cross_attention + # layer can then reuse all cross-attention key/value_states (first + # "if" case) if uni-directional self-attention (decoder) save + # Tuple(torch.Tensor, torch.Tensor) of all previous decoder + # key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected + # key/value_states (third "elif" case) if encoder bi-directional + # self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states 
= self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = BertAttention( + config, position_embedding_type='absolute') + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' + f'with cross-attention layers by setting `config.add_cross_attention=True`' + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. 
+ """ + + config_class = BertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + config = BertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + model.model_dir = model_dir + return model + + +@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.bert) +class BertModel(BertPreTrainedModel): + """The Bert Model transformer outputting raw hidden-states without any + specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + model (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matter related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. 
Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`. To be used in a + Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. + + + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): + config = BertConfig(**config) + model = cls(config, add_pooling_layer) + return model + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + Others (**kwargs) + some additional parameters might passed in from upstream pipeline, + which not influence the results. 
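+
+ Returns:
+ A [`BaseModelOutputWithPoolingAndCrossAttentions`] if `return_dict=True`,
+ otherwise a plain tuple of `(sequence_output, pooled_output)` followed by
+ the cached key/values and any requested hidden-state and attention tensors.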
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
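+ # get_extended_attention_mask() also turns the {0, 1} mask into an additive
+ # bias: 0.0 for positions to attend to and a large negative value for
+ # masked positions, so it can simply be added to the raw attention scores.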
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return outputs['pooler_output'] diff --git a/modelscope/models/nlp/bert/configuration_bert.py b/modelscope/models/nlp/bert/configuration.py similarity index 99% rename from modelscope/models/nlp/bert/configuration_bert.py rename to modelscope/models/nlp/bert/configuration.py index 2c9293ec..1e2cef95 100644 --- a/modelscope/models/nlp/bert/configuration_bert.py +++ b/modelscope/models/nlp/bert/configuration.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
# diff --git a/modelscope/models/nlp/bert_for_document_segmentation.py b/modelscope/models/nlp/bert/document_segmentation.py similarity index 99% rename from modelscope/models/nlp/bert_for_document_segmentation.py rename to modelscope/models/nlp/bert/document_segmentation.py index dfa57597..b46c77e4 100644 --- a/modelscope/models/nlp/bert_for_document_segmentation.py +++ b/modelscope/models/nlp/bert/document_segmentation.py @@ -2,6 +2,7 @@ from typing import Any, Dict +import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.modeling_outputs import TokenClassifierOutput diff --git a/modelscope/models/nlp/bert/fill_mask.py b/modelscope/models/nlp/bert/fill_mask.py new file mode 100644 index 00000000..4f81f62d --- /dev/null +++ b/modelscope/models/nlp/bert/fill_mask.py @@ -0,0 +1,299 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel +from .configuration import BertConfig + +logger = logging.get_logger(__name__) + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
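+ # The decoder weight itself is tied to the word-embedding matrix through
+ # get_output_embeddings()/tie_weights() when `config.tie_word_embeddings`
+ # is enabled, so only the bias below introduces new parameters.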
+        self.decoder = nn.Linear(
+            config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class BertOnlyNSPHead(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+class BertPreTrainingHeads(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
+class BertForMaskedLM(BertPreTrainedModel):
+    r"""Bert Model with a `language modeling` head on top.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the fill_mask model of Bert; the preprocessor of this model
+        is `modelscope.preprocessors.NLPPreprocessor`.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.bert.configuration.BertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config: BertConfig, **kwargs):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using :class:`~transformers.BertTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+                details.
+
+                `What are input IDs? <../glossary.html#input-ids>`__
+            attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+                1]``:
+
+                - 0 corresponds to a `sentence A` token,
+                - 1 corresponds to a `sentence B` token.
+
+                `What are token type IDs? <../glossary.html#token-type-ids>`_
+            position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+                config.max_position_embeddings - 1]``.
+
+                `What are position IDs? <../glossary.html#position-ids>`_
+            head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+                Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+                vectors than the model's internal embedding lookup matrix.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+                tensors for more detail.
+            output_hidden_states (:obj:`bool`, `optional`):
+                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+                more detail.
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py deleted file mode 100755 index 7c1dfcf5..00000000 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ /dev/null @@ -1,1961 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
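(A minimal sketch, not part of the patch, extending the docstring example above for the new `BertForMaskedLM`: it reuses the same model id and preprocessor call and then reads greedy predictions off the `AttentionFillMaskModelOutput` fields; whether this checkpoint's preprocessor inserts mask tokens is an assumption left unverified.)

```python
# Sketch only; mirrors the docstring example in fill_mask.py and then inspects
# the AttentionFillMaskModelOutput fields (logits, input_ids) added above.
import torch

from modelscope.models import Model
from modelscope.preprocessors import Preprocessor

model_id = 'damo/nlp_bert_backbone_base_std'   # id reused from the docstring example
model = Model.from_pretrained(model_id)
preprocessor = Preprocessor.from_pretrained(model_id)

inputs = preprocessor(('This is a test', 'This is also a test'))
with torch.no_grad():
    outputs = model(**inputs)

# Greedy per-position predictions from the MLM head; masked positions (if any)
# can be located through the echoed input_ids field.
predicted_ids = outputs.logits.argmax(dim=-1)
print(predicted_ids.shape, outputs.input_ids.shape)
```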
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. """ - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple - -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.utils.logger import get_logger -from .configuration_bert import BertConfig - -logger = get_logger(__name__) - -_CONFIG_FOR_DOC = 'BertConfig' - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model - # variable name and be able to load any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and - # exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros(self.position_ids.size(), dtype=torch.long), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor - # where it is all zeros, which usually occurs when its auto-generated, - # registered buffer helps users when tracing the model without passing - # token_type_ids, solves issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = 
self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, 'position_embedding_type', 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all - # cross attention key/value_states. Further calls to cross_attention - # layer can then reuse all cross-attention key/value_states (first - # "if" case) if uni-directional self-attention (decoder) save - # Tuple(torch.Tensor, torch.Tensor) of all previous decoder - # key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected - # key/value_states (third "elif" case) if encoder bi-directional - # self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class BertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - - def __init__(self, config, position_embedding_type=None): - super().__init__() - self.self = BertSelfAttention( - config, position_embedding_type=position_embedding_type) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states 
= self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = BertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = BertAttention( - config, position_embedding_type='absolute') - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated ' - f'with cross-attention layers by setting `config.add_cross_attention=True`' - ) - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class BertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - 
attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class BertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface - for downloading and loading pretrained models. 
- """ - - config_class = BertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, BertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class BertForPreTrainingOutput(ModelOutput): - """ - Output type of [`BertForPreTraining`]. - - Args: - loss (*optional*, returned when `labels` is provided, - `torch.FloatTensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the - next sequence prediction (classification) loss. - prediction_logits (`torch.FloatTensor` of shape `(batch_size, - sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each - vocabulary token before SoftMax). - seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, - 2)`): - Prediction scores of the next sequence prediction (classification) - head (scores of True/False continuation before SoftMax). - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_hidden_states=True` is passed or when - `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + - one for the output of each layer) of shape `(batch_size, - sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the - initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when - `output_attentions=True` is passed or when - `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape - `(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the - weighted average in the self-attention heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -BERT_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass - documentation for the generic methods the library implements for all its - model (such as downloading or saving, resizing the input embeddings, pruning - heads etc.) - - This model is also a PyTorch - [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. Use it as a regular PyTorch Module and refer to the PyTorch - documentation for all matter related to general usage and behavior. - - Parameters: - config ([`BertConfig`]): Model configuration class with all the - parameters of the model. 
- Initializing with a config file does not load the weights associated - with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - -BERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`BertTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] - for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask - values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the - inputs. Indices are selected in `[0, 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position - embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, - num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask - values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, - *optional*): - Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want - more control over how to convert `input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention - layers. See `attentions` under returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See - `hidden_states` under returned tensors for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a - plain tuple. -""" - - -@add_start_docstrings( - 'The bare Bert Model transformer outputting raw hidden-states without any specific head on top.', - BERT_START_DOCSTRING, -) -class BertModel(BertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a - decoder, in which case a layer of cross-attention is added between the - self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam - Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz - Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - `is_decoder` argument of the configuration set to `True`. To be used in a - Seq2Seq model, the model needs to initialized with both `is_decoder` - argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` - is then expected as an input to the forward pass. 
- """ - - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - - self.pooler = BertPooler(config) if add_pooling_layer else None - - # Initialize weights and apply final processing - self.post_init() - - @classmethod - def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config): - config = BertConfig(**config) - model = cls(config, add_pooling_layer) - return model - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured as a - decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of - the encoder input. This mask is used in the cross-attention if the - model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention - blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only - the last `decoder_input_ids` (those that don't have their past key - value states given to this model) of shape `(batch_size, 1)` instead - of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned - and can be used to speed up decoding (see `past_key_values`). 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - def extract_sequence_outputs(self, outputs): - return outputs['last_hidden_state'] - - def extract_pooled_outputs(self, outputs): - return outputs['pooler_output'] - - -@add_start_docstrings( - """ - Bert Model with two heads on top as done during the pretraining: a `masked - language modeling` head and a `next sentence prediction (classification)` - head. 
- """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. Indices - should be in `[-100, 0, ..., config.vocab_size]` (see - `input_ids` docstring) Tokens with indices set to `-100` are - ignored (masked), the loss is only computed for the tokens with - labels in `[0, ..., config.vocab_size]` - next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for computing the next sequence prediction - (classification) loss. Input should be a sequence pair (see - `input_ids` docstring) Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (`Dict[str, any]`, optional, defaults to *{}*): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertForPreTraining - >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return BertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, - BERT_START_DOCSTRING) -class BertLMHeadModel(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the - encoder. Used in the cross-attention if the model is configured - as a decoder. 
- encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, - sequence_length)`, *optional*): - Mask to avoid performing attention on the padding token indices - of the encoder input. This mask is used in the cross-attention - if the model is configured as a decoder. Mask values selected in - `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the left-to-right language modeling loss - (next word prediction). Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with - indices set to `-100` are ignored (masked), the loss is only - computed for the tokens with labels n `[0, ..., - config.vocab_size]` - past_key_values (`tuple(tuple(torch.FloatTensor))` of length - `config.n_layers` with each tuple having 4 tensors of shape - `(batch_size, num_heads, sequence_length - 1, - embed_size_per_head)`): - Contains precomputed key and value hidden states of the - attention blocks. Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input - only the last `decoder_input_ids` (those that don't have their - past key value states given to this model) of shape - `(batch_size, 1)` instead of all `decoder_input_ids` of shape - `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are - returned and can be used to speed up decoding (see - `past_key_values`). - - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, BertLMHeadModel, - BertConfig >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - >>> config = BertConfig.from_pretrained("bert-base-cased") - >>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - ``` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def 
prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = BertModel(config, add_pooling_layer=False) - self.cls = BertOnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the masked language modeling loss. 
Indices - should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` - docstring) Tokens with indices set to `-100` are ignored (masked), - the loss is only computed for the tokens with labels in `[0, ..., - config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - if self.config.pad_token_id is None: - raise ValueError('The PAD token should be defined for generation') - - padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the next sequence prediction (classification) - loss. Input should be a sequence pair (see `input_ids` docstring). - Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. 
- - Returns: - - Example: - - ```python >>> from transformers import BertTokenizer, - BertForNextSentencePrediction >>> import torch - - >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - - >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." - >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." - >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') - - >>> outputs = model(**encoding, labels=torch.LongTensor([1])) - >>> logits = outputs.logits - >>> assert logits[0, 0] < logits[0, 1] # next sentence was random - ``` - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model transformer with a sequence classification/regression head on top - (a linear layer on top of the pooled output) e.g. for GLUE tasks. - """, - BERT_START_DOCSTRING, -) -class BertForSequenceClassification(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. - Indices should be in `[0, ..., config.num_labels - 1]`. If - `config.num_labels == 1` a regression loss is computed (Mean-Square - loss), If `config.num_labels > 1` a classification loss is computed - (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a multiple choice classification head on top (a linear layer - on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. - """, - BERT_START_DOCSTRING, -) -class BertForMultipleChoice(BertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. - Indices should be in `[0, ..., num_choices-1]` where `num_choices` - is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - """, - BERT_START_DOCSTRING, -) -class BertForTokenClassification(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, - *optional*): - Labels for computing the token classification loss. Indices should - be in `[0, ..., config.num_labels - 1]`. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Bert Model with a span classification head on top for extractive - question-answering tasks like SQuAD (a linear layers on top of the - hidden-states output to compute `span start logits` and `span end logits`). - """, - BERT_START_DOCSTRING, -) -class BertForQuestionAnswering(BertPreTrainedModel): - - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, - *optional*): - Labels for position (index) of the start of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for - computing the token classification loss. Positions are clamped to - the length of the sequence (`sequence_length`). Position outside of - the sequence are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/bert/sentence_embedding.py b/modelscope/models/nlp/bert/sentence_embedding.py new file mode 100644 index 00000000..f4c2620e --- /dev/null +++ b/modelscope/models/nlp/bert/sentence_embedding.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import BackboneModelOutput +from modelscope.utils.constant import Tasks +from .backbone import BertModel, BertPreTrainedModel + + +@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) +class BertForSentenceEmbedding(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.config = config + setattr(self, self.base_model_prefix, + BertModel(config, add_pooling_layer=False)) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> BackboneModelOutput: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base') + >>> print(model(**preprocessor('This is a test'))) + """ + return self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + model.model_dir = model_dir + return model diff --git a/modelscope/models/nlp/bert/text_classification.py b/modelscope/models/nlp/bert/text_classification.py new file mode 100644 index 00000000..b1d18d0f --- /dev/null +++ b/modelscope/models/nlp/bert/text_classification.py @@ -0,0 +1,208 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionTextClassificationModelOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .backbone import BertModel, BertPreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+
+@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.nli, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.sentiment_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
+@MODELS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.bert)
+class BertForSequenceClassification(BertPreTrainedModel):
+    r"""Bert Model transformer with a sequence classification/regression head on top
+    (a linear layer on top of the pooled output) e.g. for GLUE tasks.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the sequence classification model of Bert. The preprocessor of this model
+        is `modelscope.preprocessors.SequenceClassificationPreprocessor`.
+
+    Trainer:
+        This model is a normal PyTorch model, and can be trained by various trainers, like EpochBasedTrainer,
+        NlpEpochBasedTrainer, or trainers from other frameworks.
+        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.bert.BertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
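+
+    Example:
+        A minimal sketch of the two initialization paths described above. This is
+        illustrative only: it assumes `BertConfig` and `BertForSequenceClassification`
+        are exported from `modelscope.models.nlp.bert`, and the checkpoint directory
+        below is a placeholder rather than a released model id.
+
+        >>> from modelscope.models import Model
+        >>> from modelscope.models.nlp.bert import BertConfig, BertForSequenceClassification
+        >>> # From a config only: the classification head is built, weights stay random.
+        >>> model = BertForSequenceClassification(BertConfig(num_labels=2))
+        >>> # From a trained checkpoint directory: configuration and weights are loaded.
+        >>> # model = Model.from_pretrained('/path/to/bert_text_classification_checkpoint')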
+ """ + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + setattr(self, self.base_model_prefix, BertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py new file mode 100644 index 00000000..79a63045 --- /dev/null +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -0,0 +1,89 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
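The `BertForTextRanking` head defined in this new file scores each (query, passage) pair with a single logit; in training mode it reshapes the flat batch into `train_batch_size` groups whose first entry is the positive passage and applies cross-entropy against an all-zero target, which pushes each positive above its negatives. A minimal self-contained sketch of that grouped softmax ranking loss follows (the batch layout, group size, and variable names are illustrative assumptions, not part of the patch):

```python
import torch

# 4 queries, each grouped with 1 positive passage (index 0) and 7 negatives.
train_batch_size, group_size = 4, 8
logits = torch.randn(train_batch_size * group_size, 1)    # one relevance score per (query, passage) pair
scores = logits.view(train_batch_size, -1)                # (4, 8): one row per query group
target = torch.zeros(train_batch_size, dtype=torch.long)  # the positive passage sits at index 0
loss = torch.nn.CrossEntropyLoss()(scores, target)        # softmax ranking loss over each group
```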
+ +import torch +import torch.utils.checkpoint + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import BertModel +from .text_classification import BertForSequenceClassification + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) +class BertForTextRanking(BertForSequenceClassification): + + def __init__(self, config, **kwargs): + super().__init__(config) + self.train_batch_size = kwargs.get('train_batch_size', 4) + setattr(self, self.base_model_prefix, + BertModel(self.config, add_pooling_layer=True)) + self.register_buffer( + 'target_label', + torch.zeros(self.train_batch_size, dtype=torch.long)) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs) -> AttentionTextClassificationModelOutput: + outputs = self.base_model.forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if self.base_model.training: + scores = logits.view(self.train_batch_size, -1) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(scores, self.target_label) + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + ) + return AttentionTextClassificationModelOutput(logits=logits, ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (1 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + num_labels = kwargs.get('num_labels', 1) + model_args = {} if num_labels is None else {'num_labels': num_labels} + + model_dir = kwargs.get('model_dir') + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_args) + model.model_dir = model_dir + return model diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py new file mode 100644 index 00000000..5dc6b0ce --- /dev/null +++ b/modelscope/models/nlp/bert/token_classification.py @@ -0,0 +1,225 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn import CrossEntropyLoss
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import TokenClassifierOutput
+from modelscope.utils import logger as logging
+from modelscope.utils.constant import Tasks
+from .backbone import BertModel, BertPreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+
+@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
+@MODELS.register_module(Tasks.part_of_speech, module_name=Models.bert)
+@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
+class BertForTokenClassification(BertPreTrainedModel):
+    r"""Bert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) and word segmentation tasks.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Preprocessor:
+        This is the token classification model of Bert. The preprocessor of this model
+        is `modelscope.preprocessors.TokenClassificationPreprocessor`.
+
+    Trainer:
+        This model is a normal PyTorch model, and can be trained by various trainers, like EpochBasedTrainer,
+        NlpEpochBasedTrainer, or trainers from other frameworks.
+        The preferred trainer in ModelScope is NlpEpochBasedTrainer.
+
+    Parameters:
+        config (:class:`~modelscope.models.nlp.bert.BertConfig`): Model configuration class with
+            all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+    """
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        setattr(self, self.base_model_prefix,
+                BertModel(config, add_pooling_layer=False))
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None
+            else config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        offset_mapping=None,
+        label_mask=None,
+    ):
+        r"""
+        Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
+            sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+ + Indices can be obtained using + :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and + :meth:`transformers.PreTrainedTokenizer.__call__` for details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in ``[0, 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or + :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert :obj:`input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention + layers. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See + ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` + instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If + :obj:`config.num_labels == 1` a regression loss is computed + (Mean-Square loss), If :obj:`config.num_labels > 1` a classification + loss is computed (Cross-Entropy). + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/csanmt/__init__.py b/modelscope/models/nlp/csanmt/__init__.py new file mode 100644 index 00000000..85531617 --- /dev/null +++ b/modelscope/models/nlp/csanmt/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
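The token classification forward above only counts loss at positions whose attention mask is 1: labels at padded positions are rewritten to the loss function's `ignore_index` so cross-entropy skips them. A small self-contained sketch of that masking follows (all shapes and tensor values below are dummy examples, not taken from the patch):

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_labels = torch.where(active_loss, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index).type_as(labels))
# Padded positions now carry ignore_index (-100) and do not contribute to the loss.
loss = loss_fct(logits.view(-1, num_labels), active_labels)
```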
+from .translation import CsanmtForTranslation diff --git a/modelscope/models/nlp/csanmt_for_translation.py b/modelscope/models/nlp/csanmt/translation.py similarity index 100% rename from modelscope/models/nlp/csanmt_for_translation.py rename to modelscope/models/nlp/csanmt/translation.py diff --git a/modelscope/models/nlp/deberta_v2/__init__.py b/modelscope/models/nlp/deberta_v2/__init__.py index 830210ed..08b184e5 100644 --- a/modelscope/models/nlp/deberta_v2/__init__.py +++ b/modelscope/models/nlp/deberta_v2/__init__.py @@ -22,38 +22,28 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_deberta_v2 import DebertaV2Config - from .tokenization_deberta_v2 import DebertaV2Tokenizer - from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast - - from .modeling_deberta_v2 import ( - DebertaV2ForMaskedLM, - DebertaV2ForMultipleChoice, - DebertaV2ForQuestionAnswering, - DebertaV2ForSequenceClassification, - DebertaV2ForTokenClassification, + from .configuration import DebertaV2Config + from .tokenization import DebertaV2Tokenizer + from .tokenization_fast import DebertaV2TokenizerFast + from .backbone import ( DebertaV2Model, DebertaV2PreTrainedModel, ) + from .fill_mask import DebertaV2ForMaskedLM else: _import_structure = { - 'configuration_deberta_v2': - ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'], - 'tokenization_deberta_v2': ['DebertaV2Tokenizer'] + 'configuration': ['DebertaV2Config'], + 'tokenization': ['DebertaV2Tokenizer'], + 'tokenization_fast': ['DebertaV2TokenizerFast'], + 'backbone': [ + 'DebertaV2Model', + 'DebertaV2PreTrainedModel', + ], + 'fill_mask': [ + 'DebertaV2ForMaskedLM', + ] } - _import_structure['tokenization_deberta_v2_fast'] = [ - 'DebertaV2TokenizerFast' - ] - _import_structure['modeling_deberta_v2'] = [ - 'DebertaV2ForMaskedLM', - 'DebertaV2ForMultipleChoice', - 'DebertaV2ForQuestionAnswering', - 'DebertaV2ForSequenceClassification', - 'DebertaV2ForTokenClassification', - 'DebertaV2Model', - 'DebertaV2PreTrainedModel', - ] import sys sys.modules[__name__] = LazyImportModule( diff --git a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py b/modelscope/models/nlp/deberta_v2/backbone.py similarity index 64% rename from modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/backbone.py index 1c6b9071..cca38133 100644 --- a/modelscope/models/nlp/deberta_v2/modeling_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/backbone.py @@ -20,28 +20,22 @@ from typing import Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import LayerNorm from transformers.activations import ACT2FN -from transformers.file_utils import (add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward) -from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput) +from transformers.modeling_outputs import BaseModelOutput from transformers.modeling_utils import PreTrainedModel from transformers.pytorch_utils import softmax_backward_data +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput from modelscope.utils import 
logger as logging -from .configuration_deberta_v2 import DebertaV2Config +from modelscope.utils.constant import Tasks +from .configuration import DebertaV2Config logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = 'DebertaV2Config' -_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer' -_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite' - # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): @@ -1006,7 +1000,7 @@ class DebertaV2Embeddings(nn.Module): # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 -class DebertaV2PreTrainedModel(PreTrainedModel): +class DebertaV2PreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. @@ -1018,6 +1012,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_unexpected = ['position_embeddings'] supports_gradient_checkpointing = True + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights.""" if isinstance(module, nn.Linear): @@ -1037,8 +1035,24 @@ class DebertaV2PreTrainedModel(PreTrainedModel): if isinstance(module, DebertaV2Encoder): module.gradient_checkpointing = value + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = DebertaV2Config(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model + + +@MODELS.register_module(Tasks.backbone, module_name=Models.deberta_v2) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + """The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top. -DEBERTA_START_DOCSTRING = r""" The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two @@ -1048,65 +1062,13 @@ DEBERTA_START_DOCSTRING = r""" Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - Parameters: - config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + config (`DebertaV2Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEBERTA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, - 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert *input_ids* indices into associated vectors than the - model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.', - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 -class DebertaV2Model(DebertaV2PreTrainedModel): + configuration. + """ - def __init__(self, config): + def __init__(self, config, **kwargs): super().__init__(config) self.embeddings = DebertaV2Embeddings(config) @@ -1130,14 +1092,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel): raise NotImplementedError( 'The prune function is not implemented in DeBERTa model.') - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutput, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1148,7 +1102,53 @@ class DebertaV2Model(DebertaV2PreTrainedModel): output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, AttentionBackboneModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ + position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a dataclass instead of a plain tuple. + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> print(model(**preprocessor('这是个测试'))) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else @@ -1216,574 +1216,9 @@ class DebertaV2Model(DebertaV2PreTrainedModel): return (sequence_output, ) + encoder_outputs[ (1 if output_hidden_states else 2):] - return BaseModelOutput( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, attentions=encoder_outputs.attentions, ) - - -@add_start_docstrings( - """DeBERTa Model with a `language modeling` head on top.""", - DEBERTA_START_DOCSTRING) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 -class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - self.deberta = DebertaV2Model(config) - self.cls = DebertaV2OnlyMLMHead(config) - - # Initialize weights and apply final processing - self.post_init() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: 
Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the - loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[1:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta -class DebertaV2PredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta -class DebertaV2LMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = DebertaV2PredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta -class DebertaV2OnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = DebertaV2LMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -@add_start_docstrings( - """ - DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 -class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, num_labels) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - # regression task - loss_fn = nn.MSELoss() - logits = logits.view(-1).to(labels.dtype) - loss = loss_fn(logits, labels.view(-1)) - elif labels.dim() == 1 or labels.size(-1) == 1: - label_index = (labels >= 0).nonzero() - labels = labels.long() - if label_index.size(0) > 0: - labeled_logits = torch.gather( - logits, 0, - label_index.expand( - label_index.size(0), logits.size(1))) - labels = torch.gather(labels, 0, label_index.view(-1)) - loss_fct = CrossEntropyLoss() - loss = loss_fct( - labeled_logits.view(-1, self.num_labels).float(), - labels.view(-1)) - else: - loss = torch.tensor(0).to(logits) - else: - log_softmax = nn.LogSoftmax(-1) - loss = -((log_softmax(logits) * labels).sum(-1)).mean() - elif self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 -class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions) - - -@add_start_docstrings( - """ - DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - DEBERTA_START_DOCSTRING, -) -# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 -class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.deberta = DebertaV2Model(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - token_type_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - start_positions: Optional[torch.Tensor] = None, - end_positions: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.deberta( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss, ) - + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. - """, - DEBERTA_START_DOCSTRING, -) -class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - num_labels = getattr(config, 'num_labels', 2) - self.num_labels = num_labels - - self.deberta = DebertaV2Model(config) - self.pooler = ContextPooler(config) - output_dim = self.pooler.output_dim - - self.classifier = nn.Linear(output_dim, 1) - drop_out = getattr(config, 'cls_dropout', None) - drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) - - self.init_weights() - - def get_input_embeddings(self): - return self.deberta.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - self.deberta.set_input_embeddings(new_embeddings) - - @add_start_docstrings_to_model_forward( - DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See - `input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - flat_input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - flat_position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - flat_token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - flat_attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - flat_inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self.deberta( - flat_input_ids, - position_ids=flat_position_ids, - token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, - inputs_embeds=flat_inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - encoder_layer = outputs[0] - pooled_output = self.pooler(encoder_layer) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits, ) + outputs[1:] - return ((loss, ) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py b/modelscope/models/nlp/deberta_v2/configuration.py similarity index 98% rename from modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/configuration.py index 65e8f0b7..7921ca2f 100644 --- a/modelscope/models/nlp/deberta_v2/configuration_deberta_v2.py +++ b/modelscope/models/nlp/deberta_v2/configuration.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DeBERTaV2Config""" -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union from transformers import PretrainedConfig diff --git a/modelscope/models/nlp/deberta_v2/fill_mask.py b/modelscope/models/nlp/deberta_v2/fill_mask.py new file mode 100644 index 00000000..ed127d4c --- /dev/null +++ b/modelscope/models/nlp/deberta_v2/fill_mask.py @@ -0,0 +1,230 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
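The new fill_mask.py module below registers DebertaV2ForMaskedLM with the ModelScope registry under the fill-mask task, so the head can be resolved by model id at runtime. A minimal usage sketch, not a definitive recipe: the model id and the masked sentence are taken from the docstrings elsewhere in this patch, and availability of that checkpoint is assumed.

# Hedged sketch: model id and example input come from the docstrings in this patch.
from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import Preprocessor

model_id = 'damo/nlp_debertav2_fill-mask_chinese-lite'
model = Model.from_pretrained(model_id)               # resolves the registered fill-mask model
preprocessor = Preprocessor.from_pretrained(model_id)

# Call the model directly on preprocessed inputs ...
print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。')))

# ... or go through the fill-mask pipeline, which wraps the same forward pass.
pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor)
print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。'))

Both entry points end up in the DebertaV2ForMaskedLM.forward defined in this file.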
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from modelscope.outputs import AttentionFillMaskModelOutput
+from modelscope.utils.constant import Tasks
+from .backbone import DebertaV2Model, DebertaV2PreTrainedModel
+
+
+# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
+@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
+class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
+    r"""DeBERTa_v2 Model with a `language modeling` head on top.
+
+    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
+    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
+    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
+    improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+    and behavior.
+
+    Preprocessor:
+        This is the fill_mask model of Deberta_v2; the preprocessor of this model
+        is `modelscope.preprocessors.NLPPreprocessor`.
+
+    Parameters:
+        config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration.
+    """
+
+    _keys_to_ignore_on_load_unexpected = [r'pooler']
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids', r'predictions.decoder.bias'
+    ]
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config)
+
+        self.deberta = DebertaV2Model(config)
+        self.cls = DebertaV2OnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, AttentionFillMaskModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
+                Indices of input sequence tokens in the vocabulary.
+
+            attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
+                Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+                1]`:
+
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
+ + position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range `[0, config.max_position_embeddings - 1]`. + + inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert *input_ids* indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a dataclass instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[1:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + input_ids=input_ids, + attentions=outputs.attentions, + hidden_states=outputs.hidden_states) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = 
nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py b/modelscope/models/nlp/deberta_v2/tokenization.py similarity index 100% rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2.py rename to modelscope/models/nlp/deberta_v2/tokenization.py diff --git a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py b/modelscope/models/nlp/deberta_v2/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py rename to modelscope/models/nlp/deberta_v2/tokenization_fast.py index a1fcecf4..913ea5bd 100644 --- a/modelscope/models/nlp/deberta_v2/tokenization_deberta_v2_fast.py +++ b/modelscope/models/nlp/deberta_v2/tokenization_fast.py @@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils import logger as logging if is_sentencepiece_available(): - from .tokenization_deberta_v2 import DebertaV2Tokenizer + from .tokenization import DebertaV2Tokenizer else: DebertaV2Tokenizer = None diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py index 9cae8cc8..051cc8f2 100644 --- a/modelscope/models/nlp/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -4,16 +4,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_gpt3 import GPT3Config - from .modeling_gpt3 import GPT3Model - from .gpt3_for_text_generation import GPT3ForTextGeneration - from .tokenizer_gpt3 import JiebaBPETokenizer + from .configuration import GPT3Config + from .backbone import GPT3Model + from .text_generation import GPT3ForTextGeneration + from .tokenizer import JiebaBPETokenizer else: _import_structure = { - 'configuration_gpt3': ['GPT3Config'], - 'modeling_gpt3': ['GPT3Model'], - 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], - 'tokenizer_gpt3': ['JiebaBPETokenizer'], + 'configuration': ['GPT3Config'], + 'backbone': ['GPT3Model'], + 'text_generation': ['GPT3ForTextGeneration'], + 'tokenizer': ['JiebaBPETokenizer'], } import sys diff --git 
a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/backbone.py similarity index 99% rename from modelscope/models/nlp/gpt3/modeling_gpt3.py rename to modelscope/models/nlp/gpt3/backbone.py index 2c23f5db..587c7a9d 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/backbone.py @@ -24,7 +24,7 @@ from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel from modelscope.utils.constant import ModelFile -from .configuration_gpt3 import GPT3Config +from .configuration import GPT3Config class GPT3SelfAttention(nn.Module): diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration.py similarity index 100% rename from modelscope/models/nlp/gpt3/configuration_gpt3.py rename to modelscope/models/nlp/gpt3/configuration.py diff --git a/modelscope/models/nlp/gpt3/gpt3_for_text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py similarity index 100% rename from modelscope/models/nlp/gpt3/gpt3_for_text_generation.py rename to modelscope/models/nlp/gpt3/text_generation.py diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer.py similarity index 100% rename from modelscope/models/nlp/gpt3/tokenizer_gpt3.py rename to modelscope/models/nlp/gpt3/tokenizer.py diff --git a/modelscope/models/nlp/backbones/__init__.py b/modelscope/models/nlp/gpt_neo/__init__.py similarity index 83% rename from modelscope/models/nlp/backbones/__init__.py rename to modelscope/models/nlp/gpt_neo/__init__.py index 749cf995..ef5fdee5 100644 --- a/modelscope/models/nlp/backbones/__init__.py +++ b/modelscope/models/nlp/gpt_neo/__init__.py @@ -4,14 +4,12 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .structbert import SbertModel + from .backbone import GPTNeoModel else: _import_structure = { - 'structbert': ['SbertModel'], + 'backbone': ['GPTNeoModel'], } - import sys - sys.modules[__name__] = LazyImportModule( __name__, globals()['__file__'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/gpt_neo/backbone.py similarity index 74% rename from modelscope/models/nlp/backbones/gpt_neo.py rename to modelscope/models/nlp/gpt_neo/backbone.py index a2d0c374..a809bcde 100644 --- a/modelscope/models/nlp/backbones/gpt_neo.py +++ b/modelscope/models/nlp/gpt_neo/backbone.py @@ -4,10 +4,11 @@ from transformers import GPTNeoModel as GPTNeoModelTransform from modelscope.metainfo import Models from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields +from modelscope.utils.constant import Tasks -@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +@BACKBONES.register_module( + group_key=Tasks.backbone, module_name=Models.gpt_neo) class GPTNeoModel(GPTNeoModelTransform): def __init__(self, **kwargs): diff --git a/modelscope/models/nlp/heads/token_classification_head.py b/modelscope/models/nlp/heads/token_classification_head.py index 3f19ca67..443f93df 100644 --- a/modelscope/models/nlp/heads/token_classification_head.py +++ b/modelscope/models/nlp/heads/token_classification_head.py @@ -37,9 +37,9 @@ class TokenClassificationHead(TorchHead): sequence_output = inputs sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - return {OutputKeys.LOGITS: logits} + return logits def compute_loss(self, outputs: Dict[str, torch.Tensor], labels) -> Dict[str, torch.Tensor]: 
logits = outputs[OutputKeys.LOGITS] - return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} + return F.cross_entropy(logits, labels) diff --git a/modelscope/models/nlp/masked_language.py b/modelscope/models/nlp/masked_language.py deleted file mode 100644 index b7a890c1..00000000 --- a/modelscope/models/nlp/masked_language.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import \ - BertForMaskedLM as BertForMaskedLMTransformer -from modelscope.models.nlp.deberta_v2 import \ - DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer -from modelscope.models.nlp.structbert import SbertForMaskedLM -from modelscope.models.nlp.veco import \ - VecoForMaskedLM as VecoForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) -class StructBertForMaskedLM(TorchModel, SbertForMaskedLM): - """Structbert for MLM model. - - Inherited from structbert.SbertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - SbertForMaskedLM.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = SbertForMaskedLM.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) -class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer): - """Bert for MLM model. - - Inherited from transformers.BertForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - BertForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = BertForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(BertForMaskedLMTransformer, - BertForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) -class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer): - """Veco for MLM model. - - Inherited from veco.VecoForMaskedLM and TorchModel, so this class can be registered into Model sets. 
- """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - VecoForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = VecoForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(VecoForMaskedLMTransformer, - VecoForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2) -class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer): - """Deberta v2 for MLM model. - - Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - DebertaV2ForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = DebertaV2ForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(DebertaV2ForMaskedLMTransformer, - DebertaV2ForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/palm_v2/__init__.py b/modelscope/models/nlp/palm_v2/__init__.py index 3a9960ec..45ab6621 100644 --- a/modelscope/models/nlp/palm_v2/__init__.py +++ b/modelscope/models/nlp/palm_v2/__init__.py @@ -17,19 +17,19 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_palm import PalmConfig - from .modeling_palm import ( + from .configuration import PalmConfig + from .backbone import ( AbsSummarizer, PalmForConditionalGeneration, Translator, ) - from .palm_for_text_generation import PalmForTextGeneration + from .text_generation import PalmForTextGeneration else: _import_structure = { - 'configuration_palm': ['PalmConfig'], - 'modeling_palm': + 'configuration': ['PalmConfig'], + 'backbone': ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'], - 'palm_for_text_generation': ['PalmForTextGeneration'], + 'text_generation': ['PalmForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/palm_v2/modeling_palm.py b/modelscope/models/nlp/palm_v2/backbone.py similarity index 99% rename from modelscope/models/nlp/palm_v2/modeling_palm.py rename to modelscope/models/nlp/palm_v2/backbone.py index f395ebd4..3e0ff805 100644 --- a/modelscope/models/nlp/palm_v2/modeling_palm.py +++ b/modelscope/models/nlp/palm_v2/backbone.py @@ -35,7 +35,7 @@ from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel from modelscope.utils import logger as logging -from .configuration_palm import PalmConfig +from .configuration import PalmConfig from .dureader_eval import compute_bleu_rouge, normalize 
CONFIG_NAME = 'config.json' diff --git a/modelscope/models/nlp/palm_v2/configuration_palm.py b/modelscope/models/nlp/palm_v2/configuration.py similarity index 100% rename from modelscope/models/nlp/palm_v2/configuration_palm.py rename to modelscope/models/nlp/palm_v2/configuration.py diff --git a/modelscope/models/nlp/palm_v2/palm_for_text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py similarity index 100% rename from modelscope/models/nlp/palm_v2/palm_for_text_generation.py rename to modelscope/models/nlp/palm_v2/text_generation.py diff --git a/modelscope/models/nlp/plug/__init__.py b/modelscope/models/nlp/plug/__init__.py index dbc20751..589a636a 100644 --- a/modelscope/models/nlp/plug/__init__.py +++ b/modelscope/models/nlp/plug/__init__.py @@ -4,13 +4,13 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_plug import PlugNLGConfig - from .modeling_plug import PlugModel + from .configuration import PlugNLGConfig + from .backbone import PlugModel from .distributed_plug import DistributedPlug else: _import_structure = { - 'configuration_plug': ['PlugNLGConfig'], - 'modeling_plug': ['PlugModel'], + 'configuration': ['PlugNLGConfig'], + 'backbone': ['PlugModel'], 'distributed_plug': ['DistributedPlug'], } diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/backbone.py similarity index 99% rename from modelscope/models/nlp/plug/modeling_plug.py rename to modelscope/models/nlp/plug/backbone.py index df00006b..7f3f12de 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -28,7 +28,7 @@ from torch import nn from modelscope.utils.nlp.distributed import (normal_init_method, scaled_init_method) -from .configuration_plug import PlugNLGConfig, PlugNLUConfig +from .configuration import PlugNLGConfig, PlugNLUConfig logger = logging.getLogger(__name__) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration.py similarity index 100% rename from modelscope/models/nlp/plug/configuration_plug.py rename to modelscope/models/nlp/plug/configuration.py diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 06009ba1..c72e92ba 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import os from typing import Dict @@ -14,7 +15,7 @@ from modelscope.utils.nlp.distributed import initialize_distributed from modelscope.utils.nlp.load_checkpoint import pre_load from modelscope.utils.torch_utils import set_random_seed_mpu from . 
import PlugModel -from .configuration_plug import PlugNLGConfig +from .configuration import PlugNLGConfig logger = get_logger(__name__) diff --git a/modelscope/models/nlp/ponet/__init__.py b/modelscope/models/nlp/ponet/__init__.py index 6d26b194..df996167 100644 --- a/modelscope/models/nlp/ponet/__init__.py +++ b/modelscope/models/nlp/ponet/__init__.py @@ -18,16 +18,16 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_ponet import PoNetConfig - from .modeling_ponet import (PoNetForMaskedLM, PoNetModel, - PoNetPreTrainedModel) - from .tokenization_ponet import PoNetTokenizer + from .configuration import PoNetConfig + from .backbone import (PoNetModel, PoNetPreTrainedModel) + from .tokenization import PoNetTokenizer + from .fill_mask import PoNetForMaskedLM else: _import_structure = { - 'configuration_ponet': ['PoNetConfig'], - 'modeling_ponet': - ['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'], - 'tokenization_ponet': ['PoNetTokenizer'], + 'configuration': ['PoNetConfig'], + 'backbone': ['PoNetModel', 'PoNetPreTrainedModel'], + 'fill_mask': ['PoNetForMaskedLM'], + 'tokenization': ['PoNetTokenizer'], } import sys diff --git a/modelscope/models/nlp/ponet/modeling_ponet.py b/modelscope/models/nlp/ponet/backbone.py similarity index 55% rename from modelscope/models/nlp/ponet/modeling_ponet.py rename to modelscope/models/nlp/ponet/backbone.py index f37954db..fcc62fa2 100644 --- a/modelscope/models/nlp/ponet/modeling_ponet.py +++ b/modelscope/models/nlp/ponet/backbone.py @@ -16,43 +16,32 @@ """PyTorch PoNet model. """ import math -from dataclasses import dataclass from distutils.version import LooseVersion -from typing import Optional, Tuple import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - SequenceClassifierOutput, TokenClassifierOutput) +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions from transformers.modeling_utils import (PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer) -from transformers.models.bert.modeling_bert import \ - load_tf_weights_in_bert as load_tf_weights_in_ponet +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger -from .configuration_ponet import PoNetConfig +from .configuration import PoNetConfig logger = get_logger(__name__) is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0') -_CHECKPOINT_FOR_DOC = 'ponet-base-uncased' -_CONFIG_FOR_DOC = 'PoNetConfig' -_TOKENIZER_FOR_DOC = 'PoNetTokenizer' - CLS_ID = 101 EOS_ID = 102 @@ -609,82 +598,20 @@ class PoNetPooler(nn.Module): return pooled_output -class PoNetPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = 
nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class PoNetLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = PoNetPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class PoNetOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class PoNetPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = PoNetLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 3) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class PoNetPreTrainedModel(PreTrainedModel): +class PoNetPreTrainedModel(TorchModel, PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = PoNetConfig - load_tf_weights = load_tf_weights_in_ponet base_model_prefix = 'ponet' _keys_to_ignore_on_load_missing = [r'position_ids'] + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + def _init_weights(self, module): """Initialize the weights""" if isinstance(module, nn.Linear): @@ -703,51 +630,22 @@ class PoNetPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) - -@dataclass -class PoNetForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.PoNetForPreTraining`. - - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Masked language modeling loss. - sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - sop loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states - (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed - or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed - or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - mlm_loss: Optional[torch.FloatTensor] = None - sop_loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = PoNetConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model -PONET_START_DOCSTRING = r""" +@MODELS.register_module(Tasks.backbone, module_name=Models.ponet) +class PoNetModel(PoNetPreTrainedModel): + """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top. This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, @@ -763,65 +661,6 @@ PONET_START_DOCSTRING = r""" Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -PONET_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? 
<../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - - -@add_start_docstrings( - 'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.', - PONET_START_DOCSTRING, -) -class PoNetModel(PoNetPreTrainedModel): - """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is @@ -834,8 +673,8 @@ class PoNetModel(PoNetPreTrainedModel): input to the forward pass. """ - def __init__(self, config, add_pooling_layer=True): - super().__init__(config) + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(config, **kwargs) self.config = config self.embeddings = PoNetEmbeddings(config) @@ -859,14 +698,6 @@ class PoNetModel(PoNetPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids=None, @@ -885,6 +716,49 @@ class PoNetModel(PoNetPreTrainedModel): return_dict=None, ): r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -906,6 +780,16 @@ class PoNetModel(PoNetPreTrainedModel): use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). + + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> print(model(**preprocessor('这是个测试'))) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1006,7 +890,7 @@ class PoNetModel(PoNetPreTrainedModel): if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( + return AttentionBackboneModelOutput( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, @@ -1014,578 +898,3 @@ class PoNetModel(PoNetPreTrainedModel): attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) - - -@add_start_docstrings( - """ - PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForPreTraining(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - - self.ponet = PoNetModel(config) - self.cls = PoNetPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. 
- - Returns: - - Example:: - - >>> from transformers import PoNetTokenizer, PoNetForPreTraining - >>> import torch - - >>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased') - >>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased') - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.prediction_logits - >>> seq_relationship_logits = outputs.seq_relationship_logits - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - masked_lm_loss = None - next_sentence_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 3), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss, masked_lm_loss, next_sentence_loss) - + output) if total_loss is not None else output - - return PoNetForPreTrainingOutput( - loss=total_loss, - mlm_loss=masked_lm_loss, - sop_loss=next_sentence_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top for CLM fine-tuning. 
""", - PONET_START_DOCSTRING) -class PoNetLMHeadModel(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj: - `(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of shape : - obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """PoNet Model with a `language modeling` head on top. 
""", - PONET_START_DOCSTRING) -class PoNetForMaskedLM(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.cls = PoNetOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - segment_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForSequenceClassification(PoNetPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - - self.ponet = PoNetModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
- """, - PONET_START_DOCSTRING, -) -class PoNetForTokenClassification(PoNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.ponet = PoNetModel(config, add_pooling_layer=False) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ponet( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits, ) + outputs[2:] - return ((loss, ) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/ponet/configuration_ponet.py b/modelscope/models/nlp/ponet/configuration.py similarity index 96% rename from modelscope/models/nlp/ponet/configuration_ponet.py rename to modelscope/models/nlp/ponet/configuration.py index 70294fc2..7dfaba48 100644 --- a/modelscope/models/nlp/ponet/configuration_ponet.py +++ b/modelscope/models/nlp/ponet/configuration.py @@ -34,8 +34,7 @@ class PoNetConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + :obj:`inputs_ids` passed. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (:obj:`int`, `optional`, defaults to 12): @@ -55,8 +54,7 @@ class PoNetConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or - :class:`~transformers.TFBertModel`. + The vocabulary size of the :obj:`token_type_ids` passed. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): diff --git a/modelscope/models/nlp/ponet/fill_mask.py b/modelscope/models/nlp/ponet/fill_mask.py new file mode 100644 index 00000000..fb09efc0 --- /dev/null +++ b/modelscope/models/nlp/ponet/fill_mask.py @@ -0,0 +1,252 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .backbone import PoNetModel, PoNetPreTrainedModel + +logger = get_logger(__name__) + + +class PoNetPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class PoNetLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = PoNetPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
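+        # (Reviewer note, not part of this patch) The decoder weight itself is shared
+        # with the input word-embedding matrix by `PreTrainedModel.tie_weights()` via
+        # `get_output_embeddings()` defined further below (when `config.tie_word_embeddings`
+        # is True). That is why the Linear is created with `bias=False` and the output
+        # bias is kept as a separate Parameter.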
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class PoNetOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = PoNetLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) +class PoNetForMaskedLM(PoNetPreTrainedModel): + r"""PoNet Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of PoNet, the preprocessor of this model + is `modelscope.preprocessors.FillMaskPoNetPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.ponet = PoNetModel(config, add_pooling_layer=False) + self.cls = PoNetOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + segment_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`('batch_size, sequence_length')`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`('batch_size, sequence_length', hidden_size)`, + `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ponet( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + segment_ids=segment_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) diff --git a/modelscope/models/nlp/ponet/tokenization_ponet.py b/modelscope/models/nlp/ponet/tokenization.py similarity index 98% rename from modelscope/models/nlp/ponet/tokenization_ponet.py rename to modelscope/models/nlp/ponet/tokenization.py index 21544886..2da91545 100644 --- a/modelscope/models/nlp/ponet/tokenization_ponet.py +++ b/modelscope/models/nlp/ponet/tokenization.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from transformers.file_utils import PaddingStrategy from transformers.models.bert.tokenization_bert import BertTokenizer +from transformers.tokenization_utils import BatchEncoding, EncodedInput from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/ponet_for_masked_language.py b/modelscope/models/nlp/ponet_for_masked_language.py deleted file mode 100644 index 11f4bc11..00000000 --- a/modelscope/models/nlp/ponet_for_masked_language.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
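Reviewer note: the masked-LM loss in the new `fill_mask.py` above relies on `CrossEntropyLoss`'s default `ignore_index=-100`, so only masked positions contribute to the loss. A minimal self-contained sketch, not part of this patch; the vocabulary size, shapes and label values are invented for illustration:

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size = 10
    prediction_scores = torch.randn(1, 4, vocab_size)   # (batch, seq_len, vocab)
    labels = torch.tensor([[-100, 7, -100, -100]])      # only position 1 was [MASK]ed

    loss_fct = CrossEntropyLoss()                        # ignore_index defaults to -100
    loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))

    # Identical to the loss over the single masked position:
    assert torch.allclose(
        loss, CrossEntropyLoss()(prediction_scores[:, 1, :], labels[:, 1]))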
- -from typing import Any, Dict - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.ponet import \ - PoNetForMaskedLM as PoNetForMaskedLMTransformer -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['PoNetForMaskedLM'] - - -@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet) -class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer): - """PoNet for MLM model.'. - - Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets. - """ - - def __init__(self, config, model_dir): - super(TorchModel, self).__init__(model_dir) - PoNetForMaskedLMTransformer.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - segment_ids=None, - position_ids=None, - head_mask=None, - labels=None): - output = PoNetForMaskedLMTransformer.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - segment_ids=segment_ids, - position_ids=position_ids, - head_mask=head_mask, - labels=labels) - output[OutputKeys.INPUT_IDS] = input_ids - return output - - @classmethod - def _instantiate(cls, **kwargs): - model_dir = kwargs.get('model_dir') - return super(PoNetForMaskedLMTransformer, - PoNetForMaskedLM).from_pretrained( - pretrained_model_name_or_path=model_dir, - model_dir=model_dir) diff --git a/modelscope/models/nlp/sentence_embedding.py b/modelscope/models/nlp/sentence_embedding.py deleted file mode 100644 index 340c133f..00000000 --- a/modelscope/models/nlp/sentence_embedding.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -from typing import Any, Dict - -import numpy as np - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.utils.constant import Tasks - -__all__ = ['SentenceEmbedding'] - - -@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert) -class SentenceEmbedding(TorchModel, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - return self.base_model(**input) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - embs = inputs['last_hidden_state'][:, 0].cpu().numpy() - num_sent = embs.shape[0] - if num_sent >= 2: - scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], - (1, 0))).tolist()[0] - else: - scores = [] - result = {'text_embedding': embs, 'scores': scores} - - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_args = {} - - return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/sequence_classification.py b/modelscope/models/nlp/sequence_classification.py deleted file mode 100644 index 156c615c..00000000 --- a/modelscope/models/nlp/sequence_classification.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod - -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.models.nlp.veco import \ - VecoForSequenceClassification as VecoForSequenceClassificationTransform -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = [ - 'SbertForSequenceClassification', 'VecoForSequenceClassification', - 'BertForSequenceClassification' -] - - -class SequenceClassificationBase(TorchModel): - """A sequence classification base class for all the fitted sequence classification models. 
- """ - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model.forward(**kwargs) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input, **kwargs): - logits = input[OutputKeys.LOGITS] - probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1))) - pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1))) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res - - -@MODELS.register_module( - Tasks.sentence_similarity, module_name=Models.structbert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.structbert) -@MODELS.register_module(Tasks.nli, module_name=Models.structbert) -@MODELS.register_module( - Tasks.zero_shot_classification, module_name=Models.structbert) -class SbertForSequenceClassification(SequenceClassificationBase, - SbertPreTrainedModel): - """Sbert sequence classification model. - - Inherited from SequenceClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - cls.id2label = {id: label for label, id in label2id.items()} - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.veco) -@MODELS.register_module(Tasks.nli, module_name=Models.veco) -class VecoForSequenceClassification(TorchModel, - VecoForSequenceClassificationTransform): - """Veco sequence classification model. - - Inherited from VecoForSequenceClassification and TorchModel, so this class can be registered into the model set. - This model cannot be inherited from SequenceClassificationBase, because Veco/XlmRoberta's classification structure - is different. - """ - - def __init__(self, config, model_dir): - super().__init__(model_dir) - VecoForSequenceClassificationTransform.__init__(self, config) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - **kwargs): - return VecoForSequenceClassificationTransform.forward( - self, - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - labels=labels) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by veco.VecoForSequenceClassification.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(VecoForSequenceClassificationTransform, - VecoForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=Models.bert) -@MODELS.register_module(Tasks.nli, module_name=Models.bert) -@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) -class BertForSequenceClassification(SequenceClassificationBase, - BertPreTrainedModel): - """Bert sequence classification model. - - Inherited from SequenceClassificationBase. 
- """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForSequenceClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/space/__init__.py b/modelscope/models/nlp/space/__init__.py index 45f856c1..32713c34 100644 --- a/modelscope/models/nlp/space/__init__.py +++ b/modelscope/models/nlp/space/__init__.py @@ -1,20 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .model import SpaceGenerator - from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig - from .space_for_dialog_intent_prediction import SpaceForDialogIntent - from .space_for_dialog_modeling import SpaceForDialogModeling - from .space_for_dialog_state_tracking import SpaceForDialogStateTracking + from .model import SpaceModelBase, SpaceTokenizer + from .dialog_intent_prediction import SpaceForDialogIntent + from .dialog_modeling import SpaceForDialogModeling + from .dialog_state_tracking import SpaceForDST + from .configuration import SpaceConfig else: _import_structure = { - 'model': - ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'], - 'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'], - 'space_for_dialog_modeling': ['SpaceForDialogModeling'], - 'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'], + 'model': ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer'], + 'dialog_intent_prediction': ['SpaceForDialogIntent'], + 'dialog_modeling': ['SpaceForDialogModeling'], + 'dialog_state_tracking': ['SpaceForDST'], + 'configuration': ['SpaceConfig'] } import sys diff --git a/modelscope/models/nlp/space/model/configuration_space.py b/modelscope/models/nlp/space/configuration.py similarity index 100% rename from modelscope/models/nlp/space/model/configuration_space.py rename to modelscope/models/nlp/space/configuration.py diff --git a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py b/modelscope/models/nlp/space/dialog_intent_prediction.py similarity index 66% rename from modelscope/models/nlp/space/space_for_dialog_intent_prediction.py rename to modelscope/models/nlp/space/dialog_intent_prediction.py index b93a6d83..79ff01cd 100644 --- a/modelscope/models/nlp/space/space_for_dialog_intent_prediction.py +++ b/modelscope/models/nlp/space/dialog_intent_prediction.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -24,6 +24,10 @@ class SpaceForDialogIntent(TorchModel): Args: model_dir (str): the model path. + text_field (`BPETextField`, *optional*, defaults to `IntentBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -72,10 +76,21 @@ class SpaceForDialogIntent(TorchModel): Example: { 'pred': array([2.62349960e-03 4.12110658e-03 4.12748595e-05 3.77560973e-05 - 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 - 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 - 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32) + 1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04 + 6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01 + 2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32), } + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogIntent + >>> from modelscope.preprocessors import DialogIntentPredictionPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-intent-prediction') + >>> preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogIntent( + model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor("What do I need to do for the card activation?"))) """ import numpy as np pred = self.trainer.forward(input) diff --git a/modelscope/models/nlp/space/space_for_dialog_modeling.py b/modelscope/models/nlp/space/dialog_modeling.py similarity index 73% rename from modelscope/models/nlp/space/space_for_dialog_modeling.py rename to modelscope/models/nlp/space/dialog_modeling.py index efa9b851..16e9dc53 100644 --- a/modelscope/models/nlp/space/space_for_dialog_modeling.py +++ b/modelscope/models/nlp/space/dialog_modeling.py @@ -8,7 +8,7 @@ from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase -from modelscope.preprocessors.space import MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -23,7 +23,12 @@ class SpaceForDialogModeling(TorchModel): """initialize the test generation model from the `model_dir` path. Args: - model_dir (str): the model path. + model_dir (`str`): + The model path. + text_field (`BPETextField`, *optional*, defaults to `MultiWOZBPETextField`): + The text field. + config (`Config`, *optional*, defaults to config in model hub): + The config. 
""" super().__init__(model_dir, *args, **kwargs) @@ -82,6 +87,19 @@ class SpaceForDialogModeling(TorchModel): 'aspn': array([47,8345,32,29,1983]), 'db': array([19, 24, 20]), } + Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDialogModeling + >>> from modelscope.preprocessors import DialogModelingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-modeling') + >>> preprocessor = DialogModelingPreprocessor(model_dir=cache_path) + >>> model = SpaceForDialogModeling(model_dir=cache_path, + text_field=preprocessor.text_field, + config=preprocessor.config) + >>> print(model(preprocessor({ + 'user_input': 'i would like a taxi from saint john \'s college to pizza hut fen ditton .', + 'history': {} + }))) """ first_turn = input['first_turn'] diff --git a/modelscope/models/nlp/space/model/modeling_space.py b/modelscope/models/nlp/space/dialog_state_tracking.py similarity index 57% rename from modelscope/models/nlp/space/model/modeling_space.py rename to modelscope/models/nlp/space/dialog_state_tracking.py index f093cbc5..9a713a59 100644 --- a/modelscope/models/nlp/space/model/modeling_space.py +++ b/modelscope/models/nlp/space/dialog_state_tracking.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,14 +16,22 @@ # limitations under the License. """PyTorch Space model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" +from typing import Dict + import torch from torch import nn from torch.nn import CrossEntropyLoss from transformers.file_utils import add_start_docstrings +from transformers.modeling_utils import PreTrainedModel -from modelscope.models.nlp.structbert.modeling_sbert import ( - SbertForMaskedLM, SbertModel, SbertPreTrainedModel) -from .configuration_space import SpaceConfig +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.models.nlp.structbert import (SbertForMaskedLM, SbertModel, + SbertPreTrainedModel) +from modelscope.utils.constant import Tasks +from .configuration import SpaceConfig SPACE_START_DOCSTRING = r""" @@ -57,6 +65,63 @@ class SpaceModel(SbertModel): config_class = SpaceConfig +class SpacePreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SpaceConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SpaceConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + @add_start_docstrings( """ Space Model transformer with Dialog state tracking heads on top (a inform projection @@ -65,7 +130,9 @@ class SpaceModel(SbertModel): """, SPACE_START_DOCSTRING, ) -class SpaceForDST(SbertPreTrainedModel): +@MODELS.register_module( + Tasks.task_oriented_conversation, module_name=Models.space_dst) +class SpaceForDST(SpacePreTrainedModel): def __init__(self, config): super(SpaceForDST, self).__init__(config) @@ -113,18 +180,105 @@ class SpaceForDST(SbertPreTrainedModel): self.init_weights() - def forward(self, - input_ids, - input_mask=None, - segment_ids=None, - position_ids=None, - head_mask=None, - start_pos=None, - end_pos=None, - inform_slot_id=None, - refer_id=None, - class_label_id=None, - diag_state=None): + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + { + 'inputs': dict(input_ids, input_masks,start_pos), # tracking states + 'outputs': dict(slots_logits), + 'unique_ids': str(test-example.json-0), # default value + 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) + 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), + 'prefix': str('final'), #default value + 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) + } + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import SpaceForDST + >>> from modelscope.preprocessors import DialogStateTrackingPreprocessor + >>> cache_path = snapshot_download('damo/nlp_space_dialog-state-tracking') + >>> model 
= SpaceForDST.from_pretrained(cache_path) + >>> preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) + >>> print(model(preprocessor({ + 'utter': { + 'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?" + }, + 'history_states': [{}] + }))) + """ + import numpy as np + import torch + + # self.model.eval() ???? + batch = input['batch'] + + features = input['features'] + diag_state = input['diag_state'] + turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] + reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] + for slot in self.config.dst_slot_list: + for i in reset_diag_state: + diag_state[slot][i] = 0 + + with torch.no_grad(): + inputs = { + 'input_ids': batch[0], + 'input_mask': batch[1], + 'segment_ids': batch[2], + 'start_pos': batch[3], + 'end_pos': batch[4], + 'inform_slot_id': batch[5], + 'refer_id': batch[6], + 'diag_state': diag_state, + 'class_label_id': batch[8] + } + unique_ids = [features[i.item()].guid for i in batch[9]] + values = [features[i.item()].values for i in batch[9]] + input_ids_unmasked = [ + features[i.item()].input_ids_unmasked for i in batch[9] + ] + inform = [features[i.item()].inform for i in batch[9]] + outputs = self._forward(**inputs) + + # Update dialog state for next turn. + for slot in self.config.dst_slot_list: + updates = outputs[2][slot].max(1)[1] + for i, u in enumerate(updates): + if u != 0: + diag_state[slot][i] = u + + return { + 'inputs': inputs, + 'outputs': outputs, + 'unique_ids': unique_ids, + 'input_ids_unmasked': input_ids_unmasked, + 'values': values, + 'inform': inform, + 'prefix': 'final', + 'ds': input['ds'] + } + + def _forward(self, + input_ids, + input_mask=None, + segment_ids=None, + position_ids=None, + head_mask=None, + start_pos=None, + end_pos=None, + inform_slot_id=None, + refer_id=None, + class_label_id=None, + diag_state=None): outputs = self.bert( input_ids, attention_mask=input_mask, @@ -132,8 +286,8 @@ class SpaceForDST(SbertPreTrainedModel): position_ids=position_ids, head_mask=head_mask) - sequence_output = outputs[0] - pooled_output = outputs[1] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output sequence_output = self.dropout(sequence_output) pooled_output = self.dropout(pooled_output) @@ -233,36 +387,6 @@ class SpaceForDST(SbertPreTrainedModel): per_slot_start_logits, per_slot_end_logits, per_slot_refer_logits, - ) + outputs[2:] + ) + (outputs.embedding_output, ) return outputs - - -@add_start_docstrings( - 'The Space Model Model with a `language modeling` head on tops', - SPACE_START_DOCSTRING, -) -class SpaceForMaskedLM(SbertForMaskedLM): - """ - This class overrides [`SbertForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = SpaceConfig - - -@add_start_docstrings( - """ - Space Model with only one head on top as done during the pretraining: a `masked language modeling` head. 
- """, - SPACE_START_DOCSTRING, -) -class SpaceForPreTraining(SbertPreTrainedModel): - - def __init__(self, model_name_or_path: str): - super(SpaceForPreTraining, self).__init__() - self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path) - - def forward(self, input_ids: torch.tensor, mlm_labels: torch.tensor): - outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels) - return outputs[0] diff --git a/modelscope/models/nlp/space/model/__init__.py b/modelscope/models/nlp/space/model/__init__.py index bb1d18e4..cfff335d 100644 --- a/modelscope/models/nlp/space/model/__init__.py +++ b/modelscope/models/nlp/space/model/__init__.py @@ -1,10 +1,8 @@ -from .configuration_space import SpaceConfig +# Copyright (c) Alibaba, Inc. and its affiliates. from .gen_unified_transformer import GenUnifiedTransformer from .generator import SpaceGenerator from .intent_unified_transformer import IntentUnifiedTransformer from .model_base import SpaceModelBase -from .modeling_space import (SpaceForDST, SpaceForMaskedLM, - SpaceForPreTraining, SpaceModel) from .tokenization_space import (BasicTokenizer, SpaceTokenizer, WordpieceTokenizer) from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/generator.py b/modelscope/models/nlp/space/model/generator.py index 0e7833e6..2e05b545 100644 --- a/modelscope/models/nlp/space/model/generator.py +++ b/modelscope/models/nlp/space/model/generator.py @@ -71,14 +71,11 @@ class SpaceGenerator(object): return def __call__(self, step_fn, state): - """ - Running generation. - - @param : step_fn : decoding one step - @type : function + """Running generation. - @param : state : initial state - @type : dict + Args: + step_fn (`function`) : decoding one step + state(`dict`) : initial state """ raise NotImplementedError @@ -104,11 +101,9 @@ class BeamSearch(SpaceGenerator): """ Running beam search. - @param : step_fn : decoding one step - @type : function - - @param : state : initial state - @type : dict + Args: + step_fn(`function`) : decoding one step + state(`dict`) : initial state """ if prev_input is not None: diff --git a/modelscope/models/nlp/space/model/model_base.py b/modelscope/models/nlp/space/model/model_base.py index d3d0baa4..b7812182 100644 --- a/modelscope/models/nlp/space/model/model_base.py +++ b/modelscope/models/nlp/space/model/model_base.py @@ -64,8 +64,8 @@ class SpaceModelBase(nn.Module): """ Forward process, include real forward, collect metrices and optimize(optional) - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ if is_training: self.train() @@ -85,11 +85,10 @@ class SpaceModelBase(nn.Module): eos_id=None, max_gen_len=None, prev_input=None): - """ - Inference process. + """Inference process. - @params : inputs : input data - @type : dict of numpy.ndarray/int/float/... + Args: + inputs(`dict` of numpy.ndarray/int/float/...) : input data """ self.eval() results = self._infer( diff --git a/modelscope/models/nlp/space/model/tokenization_space.py b/modelscope/models/nlp/space/model/tokenization_space.py index 84712b7b..e3b358d4 100644 --- a/modelscope/models/nlp/space/model/tokenization_space.py +++ b/modelscope/models/nlp/space/model/tokenization_space.py @@ -1,5 +1,5 @@ -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py index b0775541..19069971 100644 --- a/modelscope/models/nlp/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -119,15 +119,12 @@ class UnifiedTransformer(SpaceModelBase): input_mask, append_head=False, auto_regressive=False): - """ - Create attention mask. + """Create attention mask. from sequence to matrix:[batch_size, max_seq_len, 1] -> [batch_size, max_seq_len, max_seq_len] - @param : input_mask - @type : Variable(shape: [batch_size, max_seq_len]) - - @param : auto_regressive - @type : bool + Args: + input_mask (Variable(shape: [batch_size, max_seq_len])) + auto_regressive(bool) """ seq_len = input_mask.shape[1] @@ -150,15 +147,12 @@ class UnifiedTransformer(SpaceModelBase): return mask def _join_mask(self, mask1, mask2): - """ - Merge source attention mask and target attention mask. + """Merge source attention mask and target attention mask. There are four parts:left upper (lu) / right upper (ru) / left below (lb) / right below (rb) - @param : mask1 : source attention mask - @type : Variable(shape: [batch_size, max_src_len, max_src_len]) - - @param : mask1 : target attention mask - @type : Variable(shape: [batch_size, max_tgt_len, max_tgt_len]) + Args: + mask1(Variable(shape: [batch_size, max_src_len, max_src_len])) : source attention mask + mask2(Variable(shape: [batch_size, max_tgt_len, max_tgt_len])) : target attention mask """ batch_size = mask1.shape[0] seq_len1 = mask1.shape[1] diff --git a/modelscope/models/nlp/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py index 37f968d9..3044963a 100644 --- a/modelscope/models/nlp/space/modules/transformer_block.py +++ b/modelscope/models/nlp/space/modules/transformer_block.py @@ -30,18 +30,13 @@ class TransformerBlock(nn.Module): return def forward(self, inp, mask=None, cache=None): - """ - Forward process on one transformer layer. - - @param : x - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : memory - @type : Variable(shape: [batch_size, seq_len, hidden_size]) - - @param : mask + """Forward process on one transformer layer. 
- @param : cache + Args: + x(Variable(shape: [batch_size, seq_len, hidden_size])) + memory(Variable(shape: [batch_size, seq_len, hidden_size])) + mask + cache """ attn_out = self.attn(inp, mask, cache) attn_out = self.dropout_layer(attn_out) diff --git a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py b/modelscope/models/nlp/space/space_for_dialog_state_tracking.py deleted file mode 100644 index 4b9cf5c3..00000000 --- a/modelscope/models/nlp/space/space_for_dialog_state_tracking.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import Dict - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.base import Tensor -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Tasks - -__all__ = ['SpaceForDialogStateTracking'] - - -@MODELS.register_module( - Tasks.task_oriented_conversation, module_name=Models.space_dst) -class SpaceForDialogStateTracking(TorchModel): - - def __init__(self, model_dir: str, *args, **kwargs): - """initialize the test generation model from the `model_dir` path. - - Args: - model_dir (str): the model path. - """ - - super().__init__(model_dir, *args, **kwargs) - - from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig - self.model_dir = model_dir - - self.config = SpaceConfig.from_pretrained(self.model_dir) - self.model = SpaceForDST.from_pretrained(self.model_dir) - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - """return the result by the model - - Args: - input (Dict[str, Tensor]): the preprocessed data - - Returns: - Dict[str, Tensor]: results - Example: - { - 'inputs': dict(input_ids, input_masks,start_pos), # tracking states - 'outputs': dict(slots_logits), - 'unique_ids': str(test-example.json-0), # default value - 'input_ids_unmasked': array([101, 7632, 1010,0,0,0]) - 'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]), - 'prefix': str('final'), #default value - 'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]) - } - """ - import numpy as np - import torch - - self.model.eval() - batch = input['batch'] - - features = input['features'] - diag_state = input['diag_state'] - turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]] - reset_diag_state = np.where(np.array(turn_itrs) == '0')[0] - for slot in self.config.dst_slot_list: - for i in reset_diag_state: - diag_state[slot][i] = 0 - - with torch.no_grad(): - inputs = { - 'input_ids': batch[0], - 'input_mask': batch[1], - 'segment_ids': batch[2], - 'start_pos': batch[3], - 'end_pos': batch[4], - 'inform_slot_id': batch[5], - 'refer_id': batch[6], - 'diag_state': diag_state, - 'class_label_id': batch[8] - } - unique_ids = [features[i.item()].guid for i in batch[9]] - values = [features[i.item()].values for i in batch[9]] - input_ids_unmasked = [ - features[i.item()].input_ids_unmasked for i in batch[9] - ] - inform = [features[i.item()].inform for i in batch[9]] - outputs = self.model(**inputs) - - # Update dialog state for next turn. 
- for slot in self.config.dst_slot_list: - updates = outputs[2][slot].max(1)[1] - for i, u in enumerate(updates): - if u != 0: - diag_state[slot][i] = u - - return { - 'inputs': inputs, - 'outputs': outputs, - 'unique_ids': unique_ids, - 'input_ids_unmasked': input_ids_unmasked, - 'values': values, - 'inform': inform, - 'prefix': 'final', - 'ds': input['ds'] - } diff --git a/modelscope/models/nlp/space_T_cn/__init__.py b/modelscope/models/nlp/space_T_cn/__init__.py index e69de29b..b9deb700 100644 --- a/modelscope/models/nlp/space_T_cn/__init__.py +++ b/modelscope/models/nlp/space_T_cn/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .table_question_answering import TableQuestionAnswering +else: + _import_structure = { + 'table_question_answering': ['TableQuestionAnswering'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py b/modelscope/models/nlp/space_T_cn/backbone.py similarity index 99% rename from modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/backbone.py index 72c94724..5afde06e 100644 --- a/modelscope/models/nlp/space_T_cn/modeling_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/backbone.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,8 +27,7 @@ import numpy as np import torch from torch import nn -from modelscope.models.nlp.space_T_cn.configuration_space_T_cn import \ - SpaceTCnConfig +from modelscope.models.nlp.space_T_cn.configuration import SpaceTCnConfig from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger diff --git a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py b/modelscope/models/nlp/space_T_cn/configuration.py similarity index 100% rename from modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py rename to modelscope/models/nlp/space_T_cn/configuration.py index 553d8592..e698b310 100644 --- a/modelscope/models/nlp/space_T_cn/configuration_space_T_cn.py +++ b/modelscope/models/nlp/space_T_cn/configuration.py @@ -1,6 +1,6 @@ +# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
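The new space_T_cn/__init__.py above (and the space_T_en/__init__.py introduced below) expose their public classes through LazyImportModule, so the heavy model code is only imported when an attribute is first accessed. The sketch below is only a rough illustration of that pattern, not ModelScope's actual implementation: the _LazyModule name and its internals are assumptions for illustration, while the real behaviour lives in modelscope.utils.import_utils.LazyImportModule.

import importlib
import types


class _LazyModule(types.ModuleType):
    """A simplified stand-in for LazyImportModule, for illustration only."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported name to the submodule that defines it,
        # e.g. {'TableQuestionAnswering': 'table_question_answering'}.
        self._name_to_module = {
            attr: module
            for module, attrs in import_structure.items()
            for attr in attrs
        }

    def __getattr__(self, item):
        # Only called when normal attribute lookup fails, i.e. on first access.
        if item not in self._name_to_module:
            raise AttributeError(
                f'module {self.__name__!r} has no attribute {item!r}')
        submodule = importlib.import_module(
            '.' + self._name_to_module[item], self.__name__)
        value = getattr(submodule, item)
        setattr(self, item, value)  # cache so later lookups bypass __getattr__
        return value

Under this arrangement the TYPE_CHECKING branch keeps static type checkers and IDEs working against the real symbols, while at runtime sys.modules[__name__] is replaced by the lazy module so importing the package stays cheap.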
diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/space_T_cn/table_question_answering.py similarity index 94% rename from modelscope/models/nlp/table_question_answering.py rename to modelscope/models/nlp/space_T_cn/table_question_answering.py index 8e05dd0f..a3f504b7 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/space_T_cn/table_question_answering.py @@ -11,11 +11,11 @@ from transformers import BertTokenizer from modelscope.metainfo import Models from modelscope.models.base import Model, Tensor from modelscope.models.builder import MODELS -from modelscope.preprocessors.space_T_cn.fields.struct import Constant +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import Constant from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import verify_device -from .space_T_cn.configuration_space_T_cn import SpaceTCnConfig -from .space_T_cn.modeling_space_T_cn import Seq2SQL, SpaceTCnModel +from .backbone import Seq2SQL, SpaceTCnModel +from .configuration import SpaceTCnConfig __all__ = ['TableQuestionAnswering'] @@ -732,9 +732,41 @@ class TableQuestionAnswering(Model): Args: input (Dict[str, Tensor]): the preprocessed data + Returns: Dict[str, Tensor]: results Example: + { + 'result': + { + 'question_tok': ['有', '哪', '些', '风', '险', '类', '型', '?'], + 'question': '有哪些风险类型?', + 'table_id': 'fund', + 'sql': { + 'cond_conn_op': 0, + 'sel': [5], + 'agg': [0], + 'conds': [[10, 2, 'Nulll']] + }, + 'action': 10, + 'model_out': [ + [6, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [2, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0] + ] + }, + 'history_sql': None + } + + Example: + >>> from modelscope.models.nlp import TableQuestionAnswering + >>> from modelscope.preprocessors import TableQuestionAnsweringPreprocessor + >>> model = TableQuestionAnswering.from_pretrained('damo/nlp_convai_text2sql_pretrain_cn') + >>> preprocessor = TableQuestionAnsweringPreprocessor(model_dir=model.model_dir) + >>> print(model(preprocessor({'question': '有哪些风险类型?'}))) """ result = self.predict(input['datas'])[0] diff --git a/modelscope/models/nlp/space_T_en/__init__.py b/modelscope/models/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..46c8b38c --- /dev/null +++ b/modelscope/models/nlp/space_T_en/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
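+# Mirrors space_T_cn/__init__.py above: StarForTextToSql is resolved lazily via LazyImportModule on first attribute access.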
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .text_to_sql import StarForTextToSql +else: + _import_structure = { + 'text_to_sql': ['StarForTextToSql'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/star_text_to_sql.py b/modelscope/models/nlp/space_T_en/text_to_sql.py similarity index 59% rename from modelscope/models/nlp/star_text_to_sql.py rename to modelscope/models/nlp/space_T_en/text_to_sql.py index 089f1c89..ca2d2596 100644 --- a/modelscope/models/nlp/star_text_to_sql.py +++ b/modelscope/models/nlp/space_T_en/text_to_sql.py @@ -4,14 +4,13 @@ import os from typing import Dict, Optional import torch -import torch.nn as nn from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem from text2sql_lgesql.model.model_constructor import Text2SQL -from text2sql_lgesql.utils.constants import GRAMMAR_FILEPATH from modelscope.metainfo import Models -from modelscope.models.base import Model, Tensor +from modelscope.models import TorchModel +from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -21,7 +20,7 @@ __all__ = ['StarForTextToSql'] @MODELS.register_module( Tasks.table_question_answering, module_name=Models.space_T_en) -class StarForTextToSql(Model): +class StarForTextToSql(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): """initialize the star model from the `model_dir` path. @@ -59,6 +58,33 @@ class StarForTextToSql(Model): Returns: Dict[str, Tensor]: results Example: + + Example: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.models.nlp import StarForTextToSql + >>> from modelscope.preprocessors import ConversationalTextToSqlPreprocessor + >>> test_case = { + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None, + 'utterance': [ + "I'd like to see Shop names.", 'Which of these are hiring?', + 'Which shop is hiring the highest number of employees?' + ' | do you want the name of the shop ? 
| Yes' + ] + } + >>> cache_path = snapshot_download('damo/nlp_star_conversational-text-to-sql') + >>> preprocessor = ConversationalTextToSqlPreprocessor( + model_dir=cache_path, + database_id=test_case['database_id'], + db_content=True) + >>> model = StarForTextToSql(cache_path, config=preprocessor.config) + >>> print(model(preprocessor({ + 'utterance': "I'd like to see Shop names.", + 'history': [], + 'last_sql': '', + 'database_id': 'employee_hire_evaluation', + 'local_db_path': None + }))) """ self.model.eval() hyps = self.model.parse(input['batch'], self.beam_size) # diff --git a/modelscope/models/nlp/structbert/__init__.py b/modelscope/models/nlp/structbert/__init__.py index d42db83c..60d369e0 100644 --- a/modelscope/models/nlp/structbert/__init__.py +++ b/modelscope/models/nlp/structbert/__init__.py @@ -18,20 +18,26 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_sbert import SbertConfig - from .modeling_sbert import (SbertForMaskedLM, SbertModel, - SbertPreTrainedModel) - from .tokenization_sbert import (BasicTokenizer, SbertTokenizer, - WordpieceTokenizer) - from .tokenization_sbert_fast import SbertTokenizerFast + from .backbone import (SbertModel, SbertPreTrainedModel) + from .configuration import SbertConfig + from .faq_question_answering import SbertForFaqQuestionAnswering + from .fill_mask import SbertForMaskedLM + from .text_classification import SbertForSequenceClassification + from .token_classification import SbertForTokenClassification + from .tokenization import (BasicTokenizer, SbertTokenizer, + WordpieceTokenizer) + from .tokenization_fast import SbertTokenizerFast else: _import_structure = { - 'configuration_sbert': ['SbertConfig'], - 'modeling_sbert': - ['SbertForMaskedLM', 'SbertModel', 'SbertPreTrainedModel'], - 'tokenization_sbert': + 'backbone': ['SbertModel', 'SbertPreTrainedModel'], + 'configuration': ['SbertConfig'], + 'fill_mask': ['SbertForMaskedLM'], + 'faq_question_answering': ['SbertForFaqQuestionAnswering'], + 'text_classification': ['SbertForSequenceClassification'], + 'token_classification': ['SbertForTokenClassification'], + 'tokenization': ['BasicTokenizer', 'SbertTokenizer', 'WordpieceTokenizer'], - 'tokenization_sbert_fast': ['SbertTokenizerFast'], + 'tokenization_fast': ['SbertTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py new file mode 100755 index 00000000..039db3ce --- /dev/null +++ b/modelscope/models/nlp/structbert/backbone.py @@ -0,0 +1,932 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch StructBERT model. 
mainly copied from :module:`~transformers.modeling_bert`""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from packaging import version +from transformers.activations import ACT2FN +from transformers.modeling_outputs import \ + BaseModelOutputWithPastAndCrossAttentions +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from modelscope.utils.logger import get_logger +from .configuration import SbertConfig + +logger = get_logger(__name__) + + +class SbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device), + persistent=False, + ) + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + return_inputs_embeds=False): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users + # when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type 
== 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + if not return_inputs_embeds: + return embeddings + else: + return embeddings, inputs_embeds + + +class SbertSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
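+ # attention_probs shape: [batch_size, num_heads, query_len, key_len]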
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class SbertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.self = SbertSelfAttention(config) + self.output = SbertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class SbertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class SbertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class SbertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = SbertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = SbertAttention(config) + self.intermediate = SbertIntermediate(config) + self.output = SbertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' + f'layers by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class SbertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [SbertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + 
past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SbertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SbertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SbertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, SbertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = SbertConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model + + +@dataclass +class AttentionBackboneModelOutputWithEmbedding(AttentionBackboneModelOutput): + embedding_output: torch.FloatTensor = None + logits: Optional[Union[tuple, torch.FloatTensor]] = None + kwargs: dict = None + + +@MODELS.register_module(Tasks.backbone, module_name=Models.structbert) +class SbertModel(SbertPreTrainedModel): + """The StructBERT Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
+ + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config: SbertConfig, add_pooling_layer=True, **kwargs): + super().__init__(config) + self.config = config + + self.embeddings = SbertEmbeddings(config) + self.encoder = SbertEncoder(config) + + self.pooler = SbertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. 
+ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple + having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_backbone_base_std', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_backbone_base_std') + >>> print(model(**preprocessor('这是个测试'))) + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
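+ # get_extended_attention_mask (inherited from transformers' ModuleUtilsMixin) broadcasts the 2D mask to
+ # [batch_size, 1, 1, seq_len] and maps masked positions to large negative values added to the attention scores.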
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output, orignal_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + return_inputs_embeds=True, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, + pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + + return AttentionBackboneModelOutputWithEmbedding( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + embedding_output=orignal_embeds) diff --git a/modelscope/models/nlp/structbert/configuration_sbert.py b/modelscope/models/nlp/structbert/configuration.py similarity index 94% rename from modelscope/models/nlp/structbert/configuration_sbert.py rename to modelscope/models/nlp/structbert/configuration.py index a727a978..8f095f9d 100644 --- a/modelscope/models/nlp/structbert/configuration_sbert.py +++ b/modelscope/models/nlp/structbert/configuration.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ +""" StructBERT model configuration, mainly copied from :class:`~transformers.BertConfig` """ from transformers import PretrainedConfig from modelscope.utils import logger as logging @@ -26,7 +26,7 @@ class SbertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~modelscope.models.nlp.structbert.SbertModel`. - It is used to instantiate a SBERT model according to the specified arguments. 
+    It is used to instantiate a StructBERT model according to the specified arguments.
     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
@@ -74,15 +74,15 @@ class SbertConfig(PretrainedConfig):
         relevant if ``config.is_decoder=True``.
         classifier_dropout (:obj:`float`, `optional`): The dropout ratio for the classification head.
-        adv_grad_factor (:obj:`float`, `optional`): This factor will be multipled by the KL loss grad and then
+        adv_grad_factor (:obj:`float`, `optional`): This factor will be multiplied by the KL loss grad and then
            the result will be added to the original embedding.
            More details please check:https://arxiv.org/abs/1908.04577
-           The range of this value always be 1e-3~1e-7
+           The range of this value should be between 1e-3 and 1e-7
        adv_bound (:obj:`float`, `optional`): adv_bound is used to cut the top and the bottom bound of the produced
            embedding.
-           If not proveded, 2 * sigma will be used as the adv_bound factor
+           If not provided, 2 * sigma will be used as the adv_bound factor
        sigma (:obj:`float`, `optional`): The std factor used to produce a 0 mean normal distribution.
-           If adv_bound not proveded, 2 * sigma will be used as the adv_bound factor
+           If adv_bound not provided, 2 * sigma will be used as the adv_bound factor
    """
    model_type = 'structbert'
diff --git a/modelscope/models/nlp/sbert_for_faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py
similarity index 74%
rename from modelscope/models/nlp/sbert_for_faq_question_answering.py
rename to modelscope/models/nlp/structbert/faq_question_answering.py
index 23ccdcc5..c8dbf302 100644
--- a/modelscope/models/nlp/sbert_for_faq_question_answering.py
+++ b/modelscope/models/nlp/structbert/faq_question_answering.py
@@ -1,3 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+ import math import os from collections import namedtuple @@ -15,103 +17,6 @@ from modelscope.models.nlp.task_models.task_model import BaseTaskModel from modelscope.utils.config import Config, ConfigFields from modelscope.utils.constant import ModelFile, Tasks -__all__ = ['SbertForFaqQuestionAnswering'] - - -class SbertForFaqQuestionAnsweringBase(BaseTaskModel): - """base class for faq models - """ - - def __init__(self, model_dir, *args, **kwargs): - super(SbertForFaqQuestionAnsweringBase, - self).__init__(model_dir, *args, **kwargs) - - backbone_cfg = SbertConfig.from_pretrained(model_dir) - self.bert = SbertModel(backbone_cfg) - - model_config = Config.from_file( - os.path.join(model_dir, - ModelFile.CONFIGURATION)).get(ConfigFields.model, {}) - - metric = model_config.get('metric', 'cosine') - pooling_method = model_config.get('pooling', 'avg') - - Arg = namedtuple('args', [ - 'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling' - ]) - args = Arg( - metrics=metric, - proj_hidden_size=self.bert.config.hidden_size, - hidden_size=self.bert.config.hidden_size, - dropout=0.0, - pooling=pooling_method) - - self.metrics_layer = MetricsLayer(args) - self.pooling = PoolingLayer(args) - - def _get_onehot_labels(self, labels, support_size, num_cls): - labels_ = labels.view(support_size, 1) - target_oh = torch.zeros(support_size, num_cls).to(labels) - target_oh.scatter_(dim=1, index=labels_, value=1) - return target_oh.view(support_size, num_cls).float() - - def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): - input_ids = inputs['input_ids'] - input_mask = inputs['attention_mask'] - if not isinstance(input_ids, Tensor): - input_ids = torch.IntTensor(input_ids) - if not isinstance(input_mask, Tensor): - input_mask = torch.IntTensor(input_mask) - rst = self.bert(input_ids, input_mask) - last_hidden_states = rst.last_hidden_state - if len(input_mask.shape) == 2: - input_mask = input_mask.unsqueeze(-1) - pooled_representation = self.pooling(last_hidden_states, input_mask) - return pooled_representation - - -@MODELS.register_module( - Tasks.faq_question_answering, module_name=Models.structbert) -class SbertForFaqQuestionAnswering(SbertForFaqQuestionAnsweringBase): - _backbone_prefix = '' - - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - assert not self.training - query = input['query'] - support = input['support'] - if isinstance(query, list): - query = torch.stack(query) - if isinstance(support, list): - support = torch.stack(support) - n_query = query.shape[0] - n_support = support.shape[0] - query_mask = torch.ne(query, 0).view([n_query, -1]) - support_mask = torch.ne(support, 0).view([n_support, -1]) - - support_labels = input['support_labels'] - num_cls = torch.max(support_labels) + 1 - onehot_labels = self._get_onehot_labels(support_labels, n_support, - num_cls) - - input_ids = torch.cat([query, support]) - input_mask = torch.cat([query_mask, support_mask], dim=0) - pooled_representation = self.forward_sentence_embedding({ - 'input_ids': - input_ids, - 'attention_mask': - input_mask - }) - z_query = pooled_representation[:n_query] - z_support = pooled_representation[n_query:] - cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 - protos = torch.matmul(onehot_labels.transpose(0, 1), - z_support) / cls_n_support.unsqueeze(-1) - scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) - if self.metrics_layer.name == 'relation': - scores = torch.sigmoid(scores) - return {'scores': scores} - - activations = { 'relu': F.relu, 'tanh': 
torch.tanh,
@@ -247,3 +152,142 @@ class PoolingLayer(nn.Module):
 
     def forward(self, x, mask):
         return self.pooling(x, mask)
+
+
+@MODELS.register_module(
+    Tasks.faq_question_answering, module_name=Models.structbert)
+class SbertForFaqQuestionAnswering(BaseTaskModel):
+    _backbone_prefix = ''
+
+    @classmethod
+    def _instantiate(cls, **kwargs):
+        model = cls(kwargs.get('model_dir'))
+        model.load_checkpoint(kwargs.get('model_dir'))
+        return model
+
+    def __init__(self, model_dir, *args, **kwargs):
+        super().__init__(model_dir, *args, **kwargs)
+
+        backbone_cfg = SbertConfig.from_pretrained(model_dir)
+        self.bert = SbertModel(backbone_cfg)
+
+        model_config = Config.from_file(
+            os.path.join(model_dir,
+                         ModelFile.CONFIGURATION)).get(ConfigFields.model, {})
+
+        metric = model_config.get('metric', 'cosine')
+        pooling_method = model_config.get('pooling', 'avg')
+
+        Arg = namedtuple('args', [
+            'metrics', 'proj_hidden_size', 'hidden_size', 'dropout', 'pooling'
+        ])
+        args = Arg(
+            metrics=metric,
+            proj_hidden_size=self.bert.config.hidden_size,
+            hidden_size=self.bert.config.hidden_size,
+            dropout=0.0,
+            pooling=pooling_method)
+
+        self.metrics_layer = MetricsLayer(args)
+        self.pooling = PoolingLayer(args)
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data, which contains the following keys:
+                query(:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                    The query to be predicted.
+                support(:obj:`torch.LongTensor` of shape :obj:`(support_size, sequence_length)`):
+                    The support set.
+                support_labels(:obj:`torch.LongTensor` of shape :obj:`(support_size, )`):
+                    The labels of the support set.
+
+        Returns:
+            Dict[str, Tensor]: the result, which contains the following key:
+                scores(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_cls)`):
+                    Predicted scores of all classes for each query.
+ Examples: + >>> from modelscope.hub.snapshot_download import snapshot_download + >>> from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + >>> from modelscope.models.nlp import SbertForFaqQuestionAnswering + >>> cache_path = snapshot_download('damo/nlp_structbert_faq-question-answering_chinese-base') + >>> preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained(cache_path) + >>> model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) + >>> param = { + >>> 'query_set': ['如何使用优惠券', '在哪里领券', '在哪里领券'], + >>> 'support_set': [{ + >>> 'text': '卖品代金券怎么用', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '怎么使用优惠券', + >>> 'label': '6527856' + >>> }, { + >>> 'text': '这个可以一起领吗', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '付款时送的优惠券哪里领', + >>> 'label': '1000012000' + >>> }, { + >>> 'text': '购物等级怎么长', + >>> 'label': '13421097' + >>> }, { + >>> 'text': '购物等级二心', + >>> 'label': '13421097' + >>> }] + >>> } + >>> result = model(preprocessor(param)) + """ + assert not self.training + query = input['query'] + support = input['support'] + if isinstance(query, list): + query = torch.stack(query) + if isinstance(support, list): + support = torch.stack(support) + n_query = query.shape[0] + n_support = support.shape[0] + query_mask = torch.ne(query, 0).view([n_query, -1]) + support_mask = torch.ne(support, 0).view([n_support, -1]) + + support_labels = input['support_labels'] + num_cls = torch.max(support_labels) + 1 + onehot_labels = self._get_onehot_labels(support_labels, n_support, + num_cls) + + input_ids = torch.cat([query, support]) + input_mask = torch.cat([query_mask, support_mask], dim=0) + pooled_representation = self.forward_sentence_embedding({ + 'input_ids': + input_ids, + 'attention_mask': + input_mask + }) + z_query = pooled_representation[:n_query] + z_support = pooled_representation[n_query:] + cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 + protos = torch.matmul(onehot_labels.transpose(0, 1), + z_support) / cls_n_support.unsqueeze(-1) + scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) + if self.metrics_layer.name == 'relation': + scores = torch.sigmoid(scores) + return {'scores': scores} + + def _get_onehot_labels(self, labels, support_size, num_cls): + labels_ = labels.view(support_size, 1) + target_oh = torch.zeros(support_size, num_cls).to(labels) + target_oh.scatter_(dim=1, index=labels_, value=1) + return target_oh.view(support_size, num_cls).float() + + def forward_sentence_embedding(self, inputs: Dict[str, Tensor]): + input_ids = inputs['input_ids'] + input_mask = inputs['attention_mask'] + if not isinstance(input_ids, Tensor): + input_ids = torch.IntTensor(input_ids) + if not isinstance(input_mask, Tensor): + input_mask = torch.IntTensor(input_mask) + rst = self.bert(input_ids, input_mask) + last_hidden_states = rst.last_hidden_state + if len(input_mask.shape) == 2: + input_mask = input_mask.unsqueeze(-1) + pooled_representation = self.pooling(last_hidden_states, input_mask) + return pooled_representation diff --git a/modelscope/models/nlp/structbert/fill_mask.py b/modelscope/models/nlp/structbert/fill_mask.py new file mode 100644 index 00000000..e611aa88 --- /dev/null +++ b/modelscope/models/nlp/structbert/fill_mask.py @@ -0,0 +1,284 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
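To make the few-shot scoring in SbertForFaqQuestionAnswering.forward above easier to follow, here is a self-contained sketch (not part of the patch) of the prototype computation it performs: support embeddings are averaged per class into prototypes, and each query is scored against every prototype. Plain cosine similarity stands in for the configurable MetricsLayer, and random tensors stand in for the pooled StructBERT representations.

    import torch
    import torch.nn.functional as F

    def prototype_scores(z_query, z_support, support_labels):
        # z_query: (n_query, hidden), z_support: (n_support, hidden)
        # support_labels: (n_support,) integer class ids starting from 0
        num_cls = int(support_labels.max()) + 1
        onehot = torch.zeros(z_support.size(0), num_cls).to(z_support)
        onehot.scatter_(1, support_labels.view(-1, 1), 1.0)
        cls_counts = onehot.sum(dim=0) + 1e-5                       # avoid division by zero
        protos = onehot.t() @ z_support / cls_counts.unsqueeze(-1)  # (num_cls, hidden) class prototypes
        q = F.normalize(z_query, dim=-1)
        p = F.normalize(protos, dim=-1)
        return q @ p.t()                                            # (n_query, num_cls) cosine scores

    scores = prototype_scores(torch.randn(3, 768), torch.randn(6, 768),
                              torch.tensor([0, 0, 1, 1, 2, 2]))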
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +class SbertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SbertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = SbertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SbertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class SbertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = SbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) +class SbertForMaskedLM(SbertPreTrainedModel): + r"""StructBERT Model with a `language modeling` head on top. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. 
+ + Preprocessor: + This is the fill_mask model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = SbertModel(config) + self.cls = SbertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor, NLPPreprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_fill-mask_chinese-large') + >>> preprocessor = NLPPreprocessor('damo/nlp_structbert_fill-mask_chinese-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。')) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:-1] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + attention_mask_zero = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_zero], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': 
attention_mask} diff --git a/modelscope/models/nlp/structbert/modeling_sbert.py b/modelscope/models/nlp/structbert/modeling_sbert.py deleted file mode 100755 index e789037a..00000000 --- a/modelscope/models/nlp/structbert/modeling_sbert.py +++ /dev/null @@ -1,1963 +0,0 @@ -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch SBERT model. mainly copied from :module:`~transformers.modeling_bert`""" - -import math -import warnings -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import numpy as np -import torch -import torch.utils.checkpoint -from packaging import version -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput, - MultipleChoiceModelOutput, NextSentencePredictorOutput, - QuestionAnsweringModelOutput, SequenceClassifierOutput, - TokenClassifierOutput) -from transformers.modeling_utils import (PreTrainedModel, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer) - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils.constant import Fields -from modelscope.utils.logger import get_logger -from .adv_utils import compute_adv_loss, compute_adv_loss_pair -from .configuration_sbert import SbertConfig - -logger = get_logger(__name__) - -_CHECKPOINT_FOR_DOC = 'nlp_structbert_backbone_base_std' -_CONFIG_FOR_DOC = 'SbertConfig' -_TOKENIZER_FOR_DOC = 'SbertTokenizer' - - -class SbertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - 
self.register_buffer( - 'position_ids', - torch.arange(config.max_position_embeddings).expand((1, -1))) - if version.parse(torch.__version__) > version.parse('1.6.0'): - self.register_buffer( - 'token_type_ids', - torch.zeros( - self.position_ids.size(), - dtype=torch.long, - device=self.position_ids.device), - persistent=False, - ) - - def forward(self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, - return_inputs_embeds=False): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, - past_key_values_length:seq_length - + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users - # when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, 'token_type_ids'): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, - dtype=torch.long, - device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == 'absolute': - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - if not return_inputs_embeds: - return embeddings - else: - return embeddings, inputs_embeds - - -class SbertSelfAttention(nn.Module): - - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, 'embedding_size'): - raise ValueError( - f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' - f'heads ({config.num_attention_heads})') - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, - 'position_embedding_type', - 'absolute') - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - 
output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - - if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange( - seq_length, dtype=torch.long, - device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to( - dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == 'relative_key': - relative_position_scores = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == 'relative_key_query': - relative_position_scores_query = torch.einsum( - 'bhld,lrd->bhlr', query_layer, positional_embedding) - relative_position_scores_key = torch.einsum( - 'bhrd,lrd->bhlr', key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in SbertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if output_attentions else (context_layer, ) - - if self.is_decoder: - outputs = outputs + (past_key_value, ) - return outputs - - -class SbertSelfOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertAttention(nn.Module): - - def __init__(self, config): - super().__init__() - self.self = SbertSelfAttention(config) - self.output = SbertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, - self.self.attention_head_size, self.pruned_heads) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len( - heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class SbertIntermediate(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class SbertOutput(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - 
hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class SbertLayer(nn.Module): - - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = SbertAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - if not self.is_decoder: - raise ValueError( - f'{self} should be used as a decoder model if cross attention is added' - ) - self.crossattention = SbertAttention(config) - self.intermediate = SbertIntermediate(config) - self.output = SbertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[: - 2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - if not hasattr(self, 'crossattention'): - raise ValueError( - f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention ' - f'layers by setting `config.add_cross_attention=True`') - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[ - -2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[ - 1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward(self.feed_forward_chunk, - self.chunk_size_feed_forward, - self.seq_len_dim, - attention_output) - outputs = (layer_output, ) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value, ) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class SbertEncoder(nn.Module): - - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList( - [SbertLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - 
past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = ( - ) if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - def create_custom_forward(module): - - def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1], ) - if output_attentions: - all_self_attentions = all_self_attentions + ( - layer_outputs[1], ) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - if not return_dict: - return tuple(v for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -class SbertPooler(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class SbertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class SbertLMPredictionHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.transform = SbertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class SbertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class SbertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class SbertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super().__init__() - self.predictions = SbertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class SbertPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SbertConfig - base_model_prefix = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, nn.Linear): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, SbertEncoder): - module.gradient_checkpointing = value - - -@dataclass -class SbertForPreTrainingOutput(ModelOutput): - """ - Output type of :class:`~transformers.BertForPreTraining`. 
- - Args: - loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` - is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` - is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[torch.FloatTensor] = None - prediction_logits: torch.FloatTensor = None - seq_relationship_logits: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -SBERT_START_DOCSTRING = r""" - - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with - all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. -""" - -SBERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in ``[0, - 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. -""" - - -@dataclass -class BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - BaseModelOutputWithPoolingAndCrossAttentions): - embedding_output: torch.FloatTensor = None - logits: Optional[Union[tuple, torch.FloatTensor]] = None - kwargs: dict = None - - -@add_start_docstrings( - 'The Sbert Model transformer outputting raw hidden-states without any specific head on top.', - SBERT_START_DOCSTRING, -) -class SbertModel(SbertPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of - cross-attention is added between the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, - Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration - set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an - input to the forward pass. - """ - - def __init__(self, config: SbertConfig, add_pooling_layer=True): - super().__init__(config) - self.config = config - - self.embeddings = SbertEmbeddings(config) - self.encoder = SbertEncoder(config) - - self.pooler = SbertPooler(config) if add_pooling_layer else None - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple - having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time' - ) - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError( - 'You have to specify either input_ids or inputs_embeds') - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[ - 2] if past_key_values is not None else 0 - - if attention_mask is None: - attention_mask = torch.ones( - ((batch_size, seq_length + past_key_values_length)), - device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, 'token_type_ids'): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros( - input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( - ) - encoder_hidden_shape = (encoder_batch_size, - encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones( - encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, - self.config.num_hidden_layers) - - embedding_output, orignal_embeds = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - return_inputs_embeds=True, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler( - sequence_output) if self.pooler is not None else None - - if not return_dict: - return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) - - return BaseModelOutputWithPoolingAndCrossAttentionsWithEmbedding( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) - - -@add_start_docstrings( - """ - Sbert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForPreTraining(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=SbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - next_sentence_label=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): - Used to hide legacy arguments that have been deprecated. - - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls( - sequence_output, pooled_output) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - next_sentence_loss = loss_fct( - seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - - if not return_dict: - output = (prediction_scores, - seq_relationship_score) + outputs[2:-1] - return ((total_loss, ) - + output) if total_loss is not None else output - - return SbertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top for CLM fine-tuning. 
""", - SBERT_START_DOCSTRING) -class SbertLMHeadModel(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if not config.is_decoder: - logger.warning( - 'If you want to use `SbertLMHeadModel` as a standalone, add `is_decoder=True.`' - ) - - self.bert = SbertModel(config, add_pooling_layer=False) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` - with each tuple having 4 tensors of - shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, : - -1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct( - shifted_prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((lm_loss, ) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - past=None, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'past_key_values': past - } - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) - return reordered_past - - -@add_start_docstrings( - """Sbert Model with a `language modeling` head on top. 
""", - SBERT_START_DOCSTRING) -class SbertForMaskedLM(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - _keys_to_ignore_on_load_missing = [ - r'position_ids', r'predictions.decoder.bias' - ] - - def __init__(self, config: SbertConfig): - super().__init__(config) - - if config.is_decoder: - logger.warning( - 'If you want to use `SbertForMaskedLM` make sure `config.is_decoder=False` for ' - 'bi-directional self-attention.') - - self.bert = SbertModel(config) - self.cls = SbertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - labels.view(-1)) - - if not return_dict: - output = (prediction_scores, ) + outputs[2:-1] - return ((masked_lm_loss, ) - + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, - input_ids, - attention_mask=None, - **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' - attention_mask_zero = attention_mask.new_zeros( - (attention_mask.shape[0], 1)) - attention_mask = torch.cat([attention_mask, attention_mask_zero], - dim=-1) - dummy_token = torch.full((effective_batch_size, 1), - self.config.pad_token_id, - dtype=torch.long, - device=input_ids.device) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {'input_ids': input_ids, 'attention_mask': attention_mask} - - -@add_start_docstrings( - """Sbert Model 
with a `next sentence prediction (classification)` head on top. """, - SBERT_START_DOCSTRING, -) -class SbertForNextSentencePrediction(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - - self.bert = SbertModel(config) - self.cls = SbertOnlyNSPHead(config) - - self.init_weights() - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @replace_return_docstrings( - output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair - (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - - Returns: - - """ - - if 'next_sentence_label' in kwargs: - warnings.warn( - 'The `next_sentence_label` argument is deprecated and will be removed ' - 'in a future version, use `labels` instead.', - FutureWarning, - ) - labels = kwargs.pop('next_sentence_label') - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - seq_relationship_scores = self.cls(pooled_output) - - next_sentence_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct( - seq_relationship_scores.view(-1, 2), labels.view(-1)) - - if not return_dict: - output = (seq_relationship_scores, ) + outputs[2:-1] - return ((next_sentence_loss, ) - + output) if next_sentence_loss is not None else output - - return NextSentencePredictorOutput( - loss=next_sentence_loss, - logits=seq_relationship_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForSequenceClassification(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, labels, **outputs.kwargs) - - def compute_loss(self, outputs, labels, **kwargs): - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long - or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. 
- """, - SBERT_START_DOCSTRING, -) -class SbertForMultipleChoice(SbertPreTrainedModel): - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - def _forward_call(self, num_choices, **kwargs): - outputs = self.bert(**kwargs) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - outputs['logits'] = logits.view(-1, num_choices) - kwargs['num_choices'] = num_choices - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format( - 'batch_size, num_choices, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., - num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See - :obj:`input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - num_choices = input_ids.shape[ - 1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view( - -1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view( - -1, - attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view( - -1, - token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view( - -1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), - inputs_embeds.size(-1)) - if inputs_embeds is not None else None) - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - num_choices=num_choices) - - reshaped_logits = outputs.logits - kwargs = outputs.kwargs - embedding_output = outputs.embedding_output - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=reshaped_logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return MultipleChoiceModelOutput( - loss=loss, - 
logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. - """, - SBERT_START_DOCSTRING, -) -class SbertForTokenClassification(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - outputs['logits'] = logits - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - - 1]``. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - - logits = outputs.logits - embedding_output = outputs.embedding_output - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct( - logits.view(-1, self.num_labels), labels.view(-1)) - if self.config.adv_grad_factor is not None and self.training: - loss = compute_adv_loss( - embedding=embedding_output, - model=self._forward_call, - ori_logits=logits, - ori_loss=loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - with_attention_mask=attention_mask is not None, - **outputs.kwargs) - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - Sbert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - SBERT_START_DOCSTRING, -) -class SbertForQuestionAnswering(SbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config: SbertConfig): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - if self.config.adv_grad_factor is None: - logger.warning( - 'Adv parameters not set, skipping compute_adv_loss.') - self.bert = SbertModel(config, add_pooling_layer=False) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - def _forward_call(self, **kwargs): - outputs = self.bert(**kwargs) - sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - outputs['logits'] = (start_logits, end_logits) - outputs.kwargs = kwargs - return outputs - - @add_start_docstrings_to_model_forward( - SBERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. 
- Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not return_dict: - logger.error('Return tuple in sbert is not supported now.') - - outputs = self._forward_call( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict) - return self.compute_loss(outputs, start_positions, end_positions, - **outputs.kwargs) - - def compute_loss(self, - outputs, - start_positions=None, - end_positions=None, - **kwargs): - start_logits, end_logits = outputs.logits - embedding_output = outputs.embedding_output - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - if self.config.adv_grad_factor is not None and self.training: - total_loss = compute_adv_loss_pair( - embedding=embedding_output, - model=self._forward_call, - start_logits=start_logits, - end_logits=end_logits, - ori_loss=total_loss, - adv_bound=self.config.adv_bound, - adv_grad_factor=self.config.adv_grad_factor, - sigma=self.config.sigma, - **kwargs) - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/modelscope/models/nlp/structbert/text_classification.py b/modelscope/models/nlp/structbert/text_classification.py new file mode 100644 index 00000000..044cf8d0 --- /dev/null +++ b/modelscope/models/nlp/structbert/text_classification.py @@ -0,0 +1,235 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.text_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +@MODELS.register_module( + Tasks.sentence_similarity, module_name=Models.structbert) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForSequenceClassification(SbertPreTrainedModel): + r"""StructBERT Model transformer with a sequence classification/regression head on top + (a linear layer on top of the pooled output) e.g. for GLUE tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the text classification model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
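As a small illustration of the distinction drawn here, config-only construction yields randomly initialized weights while from_pretrained loads them; a sketch that assumes SbertConfig accepts BERT-style defaults plus num_labels, with a placeholder model directory (the kwargs and the path are illustrative, not taken from this patch):

# Illustrative only: module paths follow this patch's file layout.
from modelscope.models.nlp.structbert.configuration import SbertConfig
from modelscope.models.nlp.structbert.text_classification import \
    SbertForSequenceClassification

config = SbertConfig(num_labels=3)              # assumed BERT-like defaults
model = SbertForSequenceClassification(config)  # weights are random here

# Loading trained weights instead goes through from_pretrained, e.g.:
# model = SbertForSequenceClassification.from_pretrained('<local_model_dir>')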
+ """ + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + + SbertForSequenceClassification.base_model_prefix = getattr( + config, 'base_model_prefix', + SbertForSequenceClassification.base_model_prefix) + setattr(self, self.base_model_prefix, SbertModel(config)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.base_model(**kwargs) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor(('这是个测试', '这也是个测试')))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins(('这是个测试', '这也是个测试'))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + return self.compute_loss(outputs, labels, **outputs.kwargs) + + def compute_loss(self, outputs, labels, **kwargs): + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + **kwargs) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py new file mode 100644 index 00000000..a040ff3e --- /dev/null +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -0,0 +1,229 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. 
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .adv_utils import compute_adv_loss +from .backbone import SbertModel, SbertPreTrainedModel +from .configuration import SbertConfig + +logger = logging.get_logger(__name__) + + +@MODELS.register_module( + Tasks.token_classification, module_name=Models.structbert) +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) +class SbertForTokenClassification(SbertPreTrainedModel): + r"""StructBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) + e.g. for Named-Entity-Recognition (NER) tasks. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the token-classification model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.TokenClassificationPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in modelscope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
+ """ + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config: SbertConfig, **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + if self.config.adv_grad_factor is None: + logger.warning( + 'Adv parameters not set, skipping compute_adv_loss.') + setattr(self, self.base_model_prefix, + SbertModel(config, add_pooling_layer=False)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def _forward_call(self, **kwargs): + outputs = self.bert(**kwargs) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + outputs['logits'] = logits + outputs.kwargs = kwargs + return outputs + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. 
+ return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not return_dict: + logger.error('Return tuple in sbert is not supported now.') + + outputs = self._forward_call( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict) + + logits = outputs.logits + embedding_output = outputs.embedding_output + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + if self.config.adv_grad_factor is not None and self.training: + loss = compute_adv_loss( + embedding=embedding_output, + model=self._forward_call, + ori_logits=logits, + ori_loss=loss, + adv_bound=self.config.adv_bound, + adv_grad_factor=self.config.adv_grad_factor, + sigma=self.config.sigma, + with_attention_mask=attention_mask is not None, + **outputs.kwargs) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + ) diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py b/modelscope/models/nlp/structbert/tokenization.py similarity index 100% rename from modelscope/models/nlp/structbert/tokenization_sbert.py rename to modelscope/models/nlp/structbert/tokenization.py diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/structbert/tokenization_sbert_fast.py rename to modelscope/models/nlp/structbert/tokenization_fast.py index a0a81121..6f7b7ba7 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py +++ 
b/modelscope/models/nlp/structbert/tokenization_fast.py @@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger -from .tokenization_sbert import SbertTokenizer +from .tokenization import SbertTokenizer logger = get_logger(__name__) diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 38359044..e733efe2 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -7,6 +7,9 @@ if TYPE_CHECKING: from .information_extraction import InformationExtractionModel from .feature_extraction import FeatureExtractionModel from .fill_mask import FillMaskModel + from .nncrf_for_named_entity_recognition import ( + TransformerCRFForNamedEntityRecognition, + LSTMCRFForNamedEntityRecognition) from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -17,6 +20,10 @@ else: 'information_extraction': ['InformationExtractionModel'], 'feature_extraction': ['FeatureExtractionModel'], 'fill_mask': ['FillMaskModel'], + 'nncrf_for_named_entity_recognition': [ + 'TransformerCRFForNamedEntityRecognition', + 'LSTMCRFForNamedEntityRecognition' + ], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/feature_extraction.py b/modelscope/models/nlp/task_models/feature_extraction.py index 069c37aa..9360ec08 100644 --- a/modelscope/models/nlp/task_models/feature_extraction.py +++ b/modelscope/models/nlp/task_models/feature_extraction.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict import numpy as np @@ -31,13 +32,8 @@ class FeatureExtractionModel(SingleBackboneTaskModelBase): self.build_backbone(self.backbone_cfg) def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: - # backbone do not need labels, only head need for loss compute - labels = input.pop(OutputKeys.LABELS, None) - + input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - if labels is not None: - input[OutputKeys.LABELS] = labels - + sequence_output = outputs.last_hidden_state return {OutputKeys.TEXT_EMBEDDING: sequence_output} diff --git a/modelscope/models/nlp/task_models/fill_mask.py b/modelscope/models/nlp/task_models/fill_mask.py index f7ef1cc2..0f7d3345 100644 --- a/modelscope/models/nlp/task_models/fill_mask.py +++ b/modelscope/models/nlp/task_models/fill_mask.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
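The feature-extraction change above swaps tuple unpacking for attribute access on the backbone's output object; the two are equivalent whenever the first element of the tuple is the token-level hidden states. A tiny stand-in illustration (the output class here is a dummy, not the real backbone output):

import torch
from dataclasses import dataclass

@dataclass
class DummyBackboneOutput:
    # Only the field the new code reads is modelled here.
    last_hidden_state: torch.Tensor

outputs = DummyBackboneOutput(last_hidden_state=torch.randn(2, 8, 768))
sequence_output = outputs.last_hidden_state  # replaces the old tuple unpacking
assert sequence_output.shape == (2, 8, 768)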
from typing import Any, Dict import numpy as np @@ -36,7 +37,7 @@ class FillMaskModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output) if labels is not None: diff --git a/modelscope/models/nlp/task_models/information_extraction.py b/modelscope/models/nlp/task_models/information_extraction.py index a206c2fc..ce0e21a3 100644 --- a/modelscope/models/nlp/task_models/information_extraction.py +++ b/modelscope/models/nlp/task_models/information_extraction.py @@ -33,7 +33,7 @@ class InformationExtractionModel(SingleBackboneTaskModelBase): def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + sequence_output = outputs.last_hidden_state outputs = self.head.forward(sequence_output, input['text'], input['offsets']) return {OutputKeys.SPO_LIST: outputs} diff --git a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py similarity index 83% rename from modelscope/models/nlp/nncrf_for_named_entity_recognition.py rename to modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index 8b0c59b2..017e35e5 100644 --- a/modelscope/models/nlp/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -12,6 +12,7 @@ from transformers import AutoConfig, AutoModel from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierWithPredictionsOutput from modelscope.utils.constant import ModelFile, Tasks __all__ = [ @@ -39,28 +40,116 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): def eval(self): return self.model.eval() - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + ) -> Dict[str, Any]: + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. 
+ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.TokenClassifierOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ input_tensor = { - 'input_ids': input['input_ids'], - 'attention_mask': input['attention_mask'], - 'label_mask': input['label_mask'], + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, } output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], + 'offset_mapping': offset_mapping, **input_tensor, **self.model(input_tensor) } return output - def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + def postprocess(self, input: Dict[str, Any], **kwargs): predicts = self.model.decode(input) - output = { - 'text': input['text'], - 'offset_mapping': input['offset_mapping'], - 'predicts': predicts['predicts'].squeeze(0).cpu().numpy(), - } - return output + offset_len = len(input['offset_mapping']) + predictions = torch.narrow( + predicts, 1, 0, + offset_len) # index_select only move loc, not resize + return TokenClassifierWithPredictionsOutput( + loss=None, + logits=None, + hidden_states=None, + attentions=None, + offset_mapping=input['offset_mapping'], + predictions=predictions, + ) @MODELS.register_module( @@ -133,8 +222,7 @@ class TransformerCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class LSTMCRF(nn.Module): @@ -183,8 +271,7 @@ class LSTMCRF(nn.Module): inputs['label_mask'].shape[1], device=seq_lens.device)[None, :] < seq_lens[:, None] predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) - outputs = {'predicts': predicts} - return outputs + return predicts class CRF(nn.Module): diff --git a/modelscope/models/nlp/task_models/sequence_classification.py b/modelscope/models/nlp/task_models/sequence_classification.py index 1f5e46c3..6c0c09a2 100644 --- a/modelscope/models/nlp/task_models/sequence_classification.py +++ b/modelscope/models/nlp/task_models/sequence_classification.py @@ -1,8 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import os from typing import Any, Dict -import json import numpy as np from modelscope.metainfo import TaskModels @@ -16,11 +14,6 @@ from modelscope.utils.hub import parse_label_mapping __all__ = ['SequenceClassificationModel'] -@MODELS.register_module( - Tasks.sentence_similarity, module_name=TaskModels.text_classification) -@MODELS.register_module(Tasks.nli, module_name=TaskModels.text_classification) -@MODELS.register_module( - Tasks.sentiment_classification, module_name=TaskModels.text_classification) @MODELS.register_module( Tasks.text_classification, module_name=TaskModels.text_classification) class SequenceClassificationModel(SingleBackboneTaskModelBase): @@ -54,25 +47,10 @@ class SequenceClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS, None) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) + pooled_output = outputs.pooler_output outputs = self.head.forward(pooled_output) if labels is not None: input[OutputKeys.LABELS] = labels loss = self.compute_loss(outputs, labels) outputs.update(loss) return outputs - - def extract_logits(self, outputs): - return outputs[OutputKeys.LOGITS].cpu().detach() - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() - res = { - OutputKeys.PREDICTIONS: pred, - OutputKeys.PROBABILITIES: probs, - OutputKeys.LOGITS: logits - } - return res diff --git a/modelscope/models/nlp/task_models/task_model.py b/modelscope/models/nlp/task_models/task_model.py index 0b43044f..8c83517a 100644 --- a/modelscope/models/nlp/task_models/task_model.py +++ b/modelscope/models/nlp/task_models/task_model.py @@ -404,7 +404,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): def build_backbone(self, cfg): if 'prefix' in cfg: self._backbone_prefix = cfg['prefix'] - backbone = build_backbone(cfg, field=Fields.nlp) + backbone = build_backbone(cfg) setattr(self, cfg['prefix'], backbone) def build_head(self, cfg): @@ -414,7 +414,7 @@ class SingleBackboneTaskModelBase(BaseTaskModel): ) if 'prefix' in cfg: self._head_prefix = cfg['prefix'] - head = build_head(cfg, group_key=self.group_key) + head = build_head(cfg, task_name=self.group_key) setattr(self, self._head_prefix, head) return head diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index a39f58bf..2739bf11 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -8,7 +8,7 @@ from modelscope.metainfo import TaskModels from modelscope.models.builder import MODELS from modelscope.models.nlp.task_models.task_model import \ SingleBackboneTaskModelBase -from modelscope.outputs import OutputKeys +from modelscope.outputs import OutputKeys, TokenClassifierOutput from modelscope.utils.constant import Tasks from modelscope.utils.hub import parse_label_mapping from modelscope.utils.tensor_utils import (torch_nested_detach, @@ -53,27 +53,20 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): labels = input.pop(OutputKeys.LABELS) outputs = super().forward(input) - sequence_output, pooled_output = self.extract_backbone_outputs(outputs) - outputs = self.head.forward(sequence_output) + sequence_output = outputs[0] + logits = self.head.forward(sequence_output) + loss = None if labels in input: loss = self.compute_loss(outputs, labels) - 
outputs.update(loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=input['offset_mapping'], + ) return outputs def extract_logits(self, outputs): return outputs[OutputKeys.LOGITS].cpu().detach() - - def extract_backbone_outputs(self, outputs): - sequence_output = None - pooled_output = None - if hasattr(self.backbone, 'extract_sequence_outputs'): - sequence_output = self.backbone.extract_sequence_outputs(outputs) - return sequence_output, pooled_output - - def postprocess(self, input, **kwargs): - logits = self.extract_logits(input) - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - res = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return res diff --git a/modelscope/models/nlp/text_ranking.py b/modelscope/models/nlp/text_ranking.py deleted file mode 100644 index 5bc0635a..00000000 --- a/modelscope/models/nlp/text_ranking.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from typing import Any, Dict - -import numpy as np -import torch - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp import SbertForSequenceClassification -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks - -__all__ = ['TextRanking'] - - -@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) -class TextRanking(SbertForSequenceClassification, SbertPreTrainedModel): - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir, *args, **kwargs): - if hasattr(config, 'base_model_prefix'): - TextRanking.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - self.train_batch_size = kwargs.get('train_batch_size', 4) - self.register_buffer( - 'target_label', - torch.zeros(self.train_batch_size, dtype=torch.long)) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=True) - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - outputs = self.base_model.forward(**input) - - # backbone model should return pooled_output as its second output - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - if self.base_model.training: - scores = logits.view(self.train_batch_size, -1) - loss_fct = torch.nn.CrossEntropyLoss() - loss = loss_fct(scores, self.target_label) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def sigmoid(self, logits): - return np.exp(logits) / (1 + np.exp(logits)) - - def postprocess(self, inputs: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = inputs['logits'].squeeze(-1).detach().cpu().numpy() - logits = self.sigmoid(logits).tolist() - result = {OutputKeys.SCORES: logits} - return result - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. 
- Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (1 classes). - @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - - num_labels = kwargs.get('num_labels', 1) - model_args = {} if num_labels is None else {'num_labels': num_labels} - - return super(SbertPreTrainedModel, TextRanking).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py deleted file mode 100644 index e58967a5..00000000 --- a/modelscope/models/nlp/token_classification.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from abc import abstractmethod -from typing import Dict - -import numpy as np -import torch -from torch import nn - -from modelscope.metainfo import Models -from modelscope.models.base import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.nlp.bert import BertPreTrainedModel -from modelscope.models.nlp.structbert import SbertPreTrainedModel -from modelscope.outputs import OutputKeys -from modelscope.utils.constant import Tasks -from modelscope.utils.hub import parse_label_mapping -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) - -__all__ = ['SbertForTokenClassification'] - - -class TokenClassification(TorchModel): - """A token classification base class for all the fitted token classification models. - """ - - base_model_prefix: str = 'bert' - - def __init__(self, config, model_dir): - super().__init__(model_dir) - self.num_labels = config.num_labels - self.config = config - setattr(self, self.base_model_prefix, self.build_base_model()) - classifier_dropout = ( - config.classifier_dropout if config.classifier_dropout is not None - else config.hidden_dropout_prob) - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - @abstractmethod - def build_base_model(self): - """Build the backbone model. - - Returns: the backbone instance. - """ - pass - - @property - def base_model(self): - return getattr(self, self.base_model_prefix) - - def compute_loss(self, logits, labels, **kwargs): - """Compute loss. - - For example, if backbone is pretrained model, there will be a 'attention_mask' parameter to skip - useless tokens. - - Args: - logits: The logits from the classifier - labels: The labels - **kwargs: Other input params. - - Returns: The loss. 
- - """ - pass - - def forward(self, **kwargs): - labels = None - if OutputKeys.LABEL in kwargs: - labels = kwargs.pop(OutputKeys.LABEL) - elif OutputKeys.LABELS in kwargs: - labels = kwargs.pop(OutputKeys.LABELS) - - outputs = self.base_model(**kwargs) - # base model should return the sequence_output as its first output - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - if labels is not None: - loss = self.compute_loss(logits, labels, **kwargs) - return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss} - return {OutputKeys.LOGITS: logits} - - def postprocess(self, input: Dict[str, np.ndarray], - **kwargs) -> Dict[str, np.ndarray]: - logits = input[OutputKeys.LOGITS] - pred = torch.argmax(logits[0], dim=-1) - pred = torch_nested_numpify(torch_nested_detach(pred)) - logits = torch_nested_numpify(torch_nested_detach(logits)) - rst = {OutputKeys.PREDICTIONS: pred, OutputKeys.LOGITS: logits} - return rst - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) -@MODELS.register_module(Tasks.part_of_speech, module_name=Models.structbert) -@MODELS.register_module( - Tasks.token_classification, module_name=Models.structbert) -class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): - """Sbert token classification model. - - Inherited from TokenClassification. - """ - - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r'pooler'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - SbertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .structbert import SbertModel - return SbertModel(self.config, add_pooling_layer=False) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - labels=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - labels=labels) - - def compute_loss(self, logits, labels, attention_mask=None, **kwargs): - """Compute the loss with an attention mask. - - @param logits: The logits output from the classifier. - @param labels: The labels. - @param attention_mask: The attention_mask. - @param kwargs: Unused input args. - @return: The loss - """ - loss_fct = nn.CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), - torch.tensor(loss_fct.ignore_index).type_as(labels)) - return loss_fct(active_logits, active_labels) - else: - return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(SbertPreTrainedModel, - SbertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) - - -@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) -@MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForTokenClassification(TokenClassification, BertPreTrainedModel): - """Bert token classification model. - - Inherited from TokenClassificationBase. - """ - base_model_prefix: str = 'bert' - supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r'position_ids'] - - def __init__(self, config, model_dir): - if hasattr(config, 'base_model_prefix'): - BertForTokenClassification.base_model_prefix = config.base_model_prefix - super().__init__(config, model_dir) - - def build_base_model(self): - from .bert import BertModel - return BertModel(self.config, add_pooling_layer=True) - - def forward(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs): - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs) - - @classmethod - def _instantiate(cls, **kwargs): - """Instantiate the model. - - @param kwargs: Input args. - model_dir: The model dir used to load the checkpoint and the label information. - num_labels: An optional arg to tell the model how many classes to initialize. - Method will call utils.parse_label_mapping if num_labels not supplied. - If num_labels is not found, the model will use the default setting (2 classes). 
- @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained - """ - model_dir = kwargs.get('model_dir') - num_labels = kwargs.get('num_labels') - if num_labels is None: - label2id = parse_label_mapping(model_dir) - if label2id is not None and len(label2id) > 0: - num_labels = len(label2id) - - model_args = {} if num_labels is None else {'num_labels': num_labels} - return super(BertPreTrainedModel, - BertForTokenClassification).from_pretrained( - pretrained_model_name_or_path=kwargs.get('model_dir'), - model_dir=kwargs.get('model_dir'), - **model_args) diff --git a/modelscope/models/nlp/veco/__init__.py b/modelscope/models/nlp/veco/__init__.py index 0fe786fd..0774e9b4 100644 --- a/modelscope/models/nlp/veco/__init__.py +++ b/modelscope/models/nlp/veco/__init__.py @@ -18,18 +18,22 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .configuration_veco import VecoConfig - from .modeling_veco import (VecoForMaskedLM, VecoForSequenceClassification, - VecoModel) - from .tokenization_veco import VecoTokenizer - from .tokenization_veco_fast import VecoTokenizerFast + from .configuration import VecoConfig + from .backbone import VecoModel + from .text_classification import VecoForSequenceClassification + from .token_classification import VecoForTokenClassification + from .fill_mask import VecoForMaskedLM + from .tokenization import VecoTokenizer + from .tokenization_fast import VecoTokenizerFast else: _import_structure = { - 'configuration_veco': ['VecoConfig'], - 'modeling_veco': - ['VecoForMaskedLM', 'VecoForSequenceClassification', 'VecoModel'], - 'tokenization_veco': ['VecoTokenizer'], - 'tokenization_veco_fast': ['VecoTokenizerFast'], + 'configuration': ['VecoConfig'], + 'backbone': ['VecoModel'], + 'text_classification': ['VecoForSequenceClassification'], + 'fill_mask': ['VecoForMaskedLM'], + 'token_classification': ['VecoForTokenClassification'], + 'tokenization': ['VecoTokenizer'], + 'tokenization_fast': ['VecoTokenizerFast'], } import sys diff --git a/modelscope/models/nlp/veco/backbone.py b/modelscope/models/nlp/veco/backbone.py new file mode 100644 index 00000000..98d8c30a --- /dev/null +++ b/modelscope/models/nlp/veco/backbone.py @@ -0,0 +1,96 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Veco model. 
mainly copied from :module:`~transformers.modeling_xlm_roberta`""" + +from transformers import RobertaModel + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .configuration import VecoConfig + +logger = logging.get_logger(__name__) + +VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] + + +@MODELS.register_module(Tasks.backbone, module_name=Models.veco) +class VecoModel(TorchModel, RobertaModel): + """The bare Veco Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaModel`]. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionBackboneModelOutputWithEmbedding` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', task='backbone') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> print(model(**preprocessor('这是个测试'))) + + """ + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionBackboneModelOutput( + last_hidden_state=outputs.last_hidden_state, + pooler_output=outputs.pooler_output, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/configuration_veco.py b/modelscope/models/nlp/veco/configuration.py similarity index 100% rename from modelscope/models/nlp/veco/configuration_veco.py rename to modelscope/models/nlp/veco/configuration.py diff --git a/modelscope/models/nlp/veco/fill_mask.py b/modelscope/models/nlp/veco/fill_mask.py new file mode 100644 index 00000000..de2cdb4a --- /dev/null +++ b/modelscope/models/nlp/veco/fill_mask.py @@ -0,0 +1,99 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForMaskedLM + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils.constant import Tasks +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(TorchModel, RobertaForMaskedLM): + """Veco Model transformer with a masked language model head on top (a linear layer on top of the + pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of StructBERT, the preprocessor of this model + is `modelscope.preprocessors.NLPPreprocessor`. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the + appropriate documentation alongside usage examples. 
+ """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large') + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('你师父差得动你,你师父可不动我。'))) + >>> # Call the pipeline + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('你师父差得动你,你师父可不动我。')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionFillMaskModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=kwargs['input_ids'], + ) + + @classmethod + def _instantiate(cls, **kwargs): + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + ponet_config = VecoConfig(**kwargs) + model = cls(ponet_config) + else: + model = super( + Model, + cls).from_pretrained(pretrained_model_name_or_path=model_dir) + return model diff --git a/modelscope/models/nlp/veco/modeling_veco.py b/modelscope/models/nlp/veco/modeling_veco.py deleted file mode 100644 index b519c236..00000000 --- a/modelscope/models/nlp/veco/modeling_veco.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. -# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Veco model. mainly copied from :module:`~transformers.modeling_xlm_roberta`""" - -from transformers import (RobertaForMaskedLM, RobertaForMultipleChoice, - RobertaForQuestionAnswering, - RobertaForSequenceClassification, - RobertaForTokenClassification, RobertaModel) -from transformers.file_utils import add_start_docstrings - -from modelscope.metainfo import Models -from modelscope.models.builder import BACKBONES -from modelscope.utils import logger as logging -from modelscope.utils.constant import Fields -from .configuration_veco import VecoConfig - -logger = logging.get_logger(__name__) - -VECO_PRETRAINED_MODEL_ARCHIVE_LIST = [] - -VECO_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) - subclass. 
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config ([`VecoConfig`]): Model configuration class with all the parameters of the - model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model - weights. -""" - - -@add_start_docstrings( - 'The bare Veco Model transformer outputting raw hidden-states without any specific head on top.', - VECO_START_DOCSTRING, -) -class VecoModel(RobertaModel): - """ - This class overrides [`RobertaModel`]. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the - pooled output) e.g. for GLUE tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForSequenceClassification(RobertaForSequenceClassification): - """ - This class overrides [`RobertaForSequenceClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model transformer with a masked language model head on top (a linear layer on top of the - pooled output). - """, - VECO_START_DOCSTRING, -) -class VecoForMaskedLM(RobertaForMaskedLM): - """ - This class overrides [`RobertaForMaskedLM`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a multiple choice classification head on top (a linear layer on top of the pooled output and - a softmax) e.g. for RocStories/SWAG tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForMultipleChoice(RobertaForMultipleChoice): - """ - This class overrides [`RobertaForMultipleChoice`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. - for Named-Entity-Recognition (NER) tasks. - """, - VECO_START_DOCSTRING, -) -class VecoForTokenClassification(RobertaForTokenClassification): - """ - This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig - - -@add_start_docstrings( - """ - Veco Model with a span classification head on top for extractive question-answering tasks like SQuAD (a - linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - VECO_START_DOCSTRING, -) -class VecoForQuestionAnswering(RobertaForQuestionAnswering): - """ - This class overrides [`RobertaForQuestionAnswering`]. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = VecoConfig diff --git a/modelscope/models/nlp/veco/text_classification.py b/modelscope/models/nlp/veco/text_classification.py new file mode 100644 index 00000000..e4e74d8f --- /dev/null +++ b/modelscope/models/nlp/veco/text_classification.py @@ -0,0 +1,150 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForSequenceClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.nli, module_name=Models.veco) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.veco) +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco) +@MODELS.register_module(Tasks.text_classification, module_name=Models.veco) +class VecoForSequenceClassification(TorchModel, + RobertaForSequenceClassification): + """Veco Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the text classification model of Veco, the preprocessor of this model + is `modelscope.preprocessors.SequenceClassificationPreprocessor`. + + Trainer: + This model should be trained by dataset which has mixed languages, + and evaluated by datasets of languages one by one. + For example, if the training dataset is xnli (which has sub datasets of multiple languages), then you + should mix the sub-datasets with the languages you want to train to one training dataset, and evaluate + the model one sub-dataset by one sub-dataset of different languages. + This procedure can be done by custom code. If you are using trainer of ModelScope, + the `VecoTrainer` is suggested to use to train this model. This trainer overrides the basic evaluation + loop, and will call the evaluation dataset one by one. Besides, this trainer will use the `VecoTaskDataset` + to mix the input datasets to one, you can check the API Doc for the details. + + To check the complete example please + view the unittest `test_veco_xnli` in `tests.trainers.test_finetune_sequence_classification.py` + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForSequenceClassification`]. 
Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + """ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> task='text-classification', num_labels=2) + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_veco_fill-mask-large', + >>> label2id={'0': 0, '1': 1}) + >>> # Call the model, return some tensors + >>> print(model(**preprocessor('这是个测试'))) + >>> # Call the pipeline, the result may be incorrect + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', pipeline_name='text-classification', + >>> model=model, preprocessor=preprocessor) + >>> print(pipeline_ins('这是个测试')) + """ + + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTextClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/token_classification.py b/modelscope/models/nlp/veco/token_classification.py new file mode 100644 index 00000000..f6252209 --- /dev/null +++ b/modelscope/models/nlp/veco/token_classification.py @@ -0,0 +1,107 @@ +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from transformers import RobertaForTokenClassification + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTokenClassificationModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.hub import parse_label_mapping +from .configuration import VecoConfig + + +@MODELS.register_module(Tasks.token_classification, module_name=Models.veco) +class VecoForTokenClassification(TorchModel, RobertaForTokenClassification): + """Veco Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config ([`VecoConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + This class overrides [`RobertaForTokenClassification`]. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = VecoConfig + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def forward(self, *args, **kwargs): + kwargs['return_dict'] = True + outputs = super(Model, self).forward(*args, **kwargs) + return AttentionTokenClassificationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels is not input. + label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists). 
+ + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + if model_dir is None: + config = VecoConfig(**kwargs) + model = cls(config) + else: + model_kwargs = {} + label2id = kwargs.get('label2id', parse_label_mapping(model_dir)) + id2label = kwargs.get( + 'id2label', None if label2id is None else + {id: label + for label, id in label2id.items()}) + if id2label is not None and label2id is None: + label2id = {label: id for id, label in id2label.items()} + + num_labels = kwargs.get( + 'num_labels', None if label2id is None else len(label2id)) + if num_labels is not None: + model_kwargs['num_labels'] = num_labels + if label2id is not None: + model_kwargs['label2id'] = label2id + if id2label is not None: + model_kwargs['id2label'] = id2label + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_kwargs) + return model diff --git a/modelscope/models/nlp/veco/tokenization_veco.py b/modelscope/models/nlp/veco/tokenization.py similarity index 100% rename from modelscope/models/nlp/veco/tokenization_veco.py rename to modelscope/models/nlp/veco/tokenization.py diff --git a/modelscope/models/nlp/veco/tokenization_veco_fast.py b/modelscope/models/nlp/veco/tokenization_fast.py similarity index 99% rename from modelscope/models/nlp/veco/tokenization_veco_fast.py rename to modelscope/models/nlp/veco/tokenization_fast.py index 3edae0e7..b41a5c3b 100644 --- a/modelscope/models/nlp/veco/tokenization_veco_fast.py +++ b/modelscope/models/nlp/veco/tokenization_fast.py @@ -27,7 +27,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast from modelscope.utils import logger as logging if is_sentencepiece_available(): - from .tokenization_veco import VecoTokenizer + from .tokenization import VecoTokenizer else: VecoTokenizer = None diff --git a/modelscope/msdatasets/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py index 014e4faa..4d82b741 100644 --- a/modelscope/msdatasets/task_datasets/torch_base_dataset.py +++ b/modelscope/msdatasets/task_datasets/torch_base_dataset.py @@ -19,6 +19,7 @@ class TorchTaskDataset(TaskDataset, Dataset): preprocessor=None, **kwargs): TaskDataset.__init__(self, datasets, mode, preprocessor, **kwargs) + self.trainer = None def __getitem__(self, index) -> Any: return self.prepare_sample(self._inner_dataset[index]) diff --git a/modelscope/outputs/__init__.py b/modelscope/outputs/__init__.py new file mode 100644 index 00000000..47e66714 --- /dev/null +++ b/modelscope/outputs/__init__.py @@ -0,0 +1,2 @@ +from .nlp.model_outputs import * # noqa +from .outputs import TASK_OUTPUTS, ModelOutputBase, OutputKeys diff --git a/modelscope/preprocessors/space_T_cn/fields/__init__.py b/modelscope/outputs/nlp/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/__init__.py rename to modelscope/outputs/nlp/__init__.py diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp/model_outputs.py new file mode 100644 index 00000000..dcb37145 --- /dev/null +++ b/modelscope/outputs/nlp/model_outputs.py @@ -0,0 +1,543 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +from modelscope.outputs.outputs import ModelOutputBase + +Tensor = Union['torch.Tensor', 'tf.Tensor'] + + +@dataclass +class TextClassificationModelOutput(ModelOutputBase): + """The output class for text classification models. 
+ + Args: + logits (`Tensor`): The logits output of the model. loss (`Tensor`, + *optional*) The loss of the model, available when training. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + + +@dataclass +class TokenClassificationModelOutput(ModelOutputBase): + """The output class for token classification models. + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + """ + + logits: Tensor = None + loss: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class FillMaskModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + logits (`Tensor`): The logits output of the model. + loss (`Tensor`, *optional*) The loss of the model, available when training. + input_ids (`Tensor`, *optional*) The input id tensor fed into the model. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at the + output of each layer plus the optional initial embedding outputs. + """ + + logits: Tensor = None + loss: Tensor = None + input_ids: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class TokenClassifierOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + + +@dataclass +class TokenClassifierWithPredictionsOutput(ModelOutputBase): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.num_labels)`): + Classification scores (before SoftMax). 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + predictions: A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + + """ + + loss: Tensor = None + logits: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + offset_mapping: Tensor = None + predictions: Tensor = None + + +@dataclass +class BaseModelOutput(ModelOutputBase): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + hidden_states: Optional[Tuple[Tensor]] = None + attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class BackboneModelOutput(ModelOutputBase): + """The output class for text classification models. + + Args: + last_hidden_state (`Tensor`, *optional*): Sequence of hidden-states at + the output of the last layer of the model. + pooler_output (`Tensor`, *optional*) The tensor of the pooled hidden state. + hidden_states (`Tensor`, *optional*) Hidden-states of the model at + the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionBackboneModelOutput(BackboneModelOutput): + """The output class for backbones of attention based models. 
+ + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + past_key_values: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class AttentionTextClassificationModelOutput(TextClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): + """The output class for backbones of attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + """ + attentions: Tensor = None + hidden_states: Tensor = None + + +@dataclass +class AttentionFillMaskModelOutput(FillMaskModelOutput): + """The output class for the fill mask and attention based models. + + Args: + attentions (`tuple(Tensor)`, *optional* Attentions weights after the + attention softmax, used to compute the weighted average in the + self-attention heads. + """ + attentions: Tensor = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutputBase): + """ + Base class for model's outputs that also contains a pooling of the last + hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, + hidden_size)`): + Last layer hidden-state of the first token of the sequence + (classification token) after further processing through the layers + used for the auxiliary pretraining task. E.g. for BERT-family of + models, this returns the classification token after processing + through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction + (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` and `config.add_cross_attention=True` is passed + or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + """ + + last_hidden_state: Tensor = None + pooler_output: Tensor = None + hidden_states: Tensor = None + past_key_values: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutputBase): + """ + Base class for model's outputs that may also contain a past key/values (to + speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that + can be used (see `past_key_values` input) to speed up sequential + decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the + optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` and `config.add_cross_attention=True` is passed + or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. 
+ + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Tensor = None + hidden_states: Tensor = None + attentions: Tensor = None + cross_attentions: Tensor = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutputBase): + """ + Base class for model encoder's outputs that also contains : pre-computed + hidden states that can speed up sequential decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the + decoder of the model. + + If `past_key_values` is used only the last hidden-state of the + sequences of shape `(batch_size, 1, hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + optional initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + last_hidden_state: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutputBase): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when + `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, + config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned + when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, + with each tuple having 2 tensors of shape `(batch_size, num_heads, + sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, + embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the + self-attention blocks and in the cross-attention blocks) that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the + initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when + `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the + cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder of the model. 
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, + if the model has an embedding layer, + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the + initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned + when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used + to compute the weighted average in the self-attention heads. + """ + + loss: Optional[Tensor] = None + logits: Tensor = None + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None + decoder_hidden_states: Optional[Tuple[Tensor]] = None + decoder_attentions: Optional[Tuple[Tensor]] = None + cross_attentions: Optional[Tuple[Tensor]] = None + encoder_last_hidden_state: Optional[Tensor] = None + encoder_hidden_states: Optional[Tuple[Tensor]] = None + encoder_attentions: Optional[Tuple[Tensor]] = None diff --git a/modelscope/outputs.py b/modelscope/outputs/outputs.py similarity index 93% rename from modelscope/outputs.py rename to modelscope/outputs/outputs.py index 34bde76a..721fb271 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs/outputs.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from collections import OrderedDict, namedtuple +from dataclasses import dataclass, fields from modelscope.utils.constant import Tasks @@ -488,7 +490,6 @@ TASK_OUTPUTS = { # ] # } Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS], - Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS], # TODO @wenmeng.zwm support list of result check # named entity recognition result for single sample @@ -499,6 +500,7 @@ TASK_OUTPUTS = { # ] # } Tasks.named_entity_recognition: [OutputKeys.OUTPUT], + Tasks.part_of_speech: [OutputKeys.OUTPUT], # text_error_correction result for a single sample # { @@ -779,3 +781,60 @@ TASK_OUTPUTS = { # } Tasks.product_segmentation: [OutputKeys.MASKS], } + + +class ModelOutputBase(list): + + def __post_init__(self): + self.reconstruct() + self.post_init = True + + def reconstruct(self): + # Low performance, but low frequency. 
+ self.clear() + for idx, key in enumerate(self.keys()): + self.append(getattr(self, key)) + + def __getitem__(self, item): + if isinstance(item, str): + if hasattr(self, item): + return getattr(self, item) + elif isinstance(item, (int, slice)): + return super().__getitem__(item) + raise IndexError(f'No Index {item} found in the dataclass.') + + def __setitem__(self, key, value): + if isinstance(key, str): + if key in [f.name for f in fields(self)]: + if key not in self.keys(): + super().__setattr__(key, value) + self.reconstruct() + elif id(getattr(self, key)) != id(value): + super().__setattr__(key, value) + super().__setitem__(self.keys().index(key), value) + else: + super().__setattr__(key, value) + elif isinstance(key, int): + super().__setitem__(key, value) + key_name = self.keys()[key] + super().__setattr__(key_name, value) + + def __setattr__(self, key, value): + if getattr(self, 'post_init', False): + return self.__setitem__(key, value) + else: + return super().__setattr__(key, value) + + def keys(self): + return [ + f.name for f in fields(self) if getattr(self, f.name) is not None + ] + + def items(self): + return self.to_dict().items() + + def to_dict(self): + output = OrderedDict() + for key in self.keys(): + output[key] = getattr(self, key) + return output diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 644749fc..bca80502 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -33,7 +33,7 @@ if is_tf_available(): Tensor = Union['torch.Tensor', 'tf.Tensor'] Input = Union[str, tuple, MsDataset, 'Image.Image', 'numpy.ndarray'] -InputModel = Union[str, Model] +InputModel = Union[str, Model, 'torch.nn.Module'] logger = get_logger() @@ -49,13 +49,7 @@ class Pipeline(ABC): return Model.from_pretrained( model, model_prefetched=True, device=self.device_name) if is_model(model) else model - elif isinstance(model, Model): - return model else: - if model and not isinstance(model, str): - raise ValueError( - f'model type for single model is either str or Model, but got type {type(model)}' - ) return model def initiate_multiple_models(self, input_models: List[InputModel]): @@ -139,12 +133,10 @@ class Pipeline(ABC): def _get_framework(self) -> str: frameworks = [] for m in self.models: - if isinstance(m, Model): - model_dir = m.model_dir - else: - assert isinstance(m, - str), 'model should be either str or Model.' + if isinstance(m, str): model_dir = m + else: + model_dir = m.model_dir cfg_file = osp.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) frameworks.append(cfg.framework) @@ -387,10 +379,13 @@ class DistributedPipeline(Pipeline): def _instantiate_one(cls, rank, model_dir, **kwargs): """Instantiate one model piece. - @param rank: The model rank. - @param model_dir: The model_dir in the node. - @param kwargs: Any extra args. - @return: None. The model handler should be kept in the class field. + Args: + rank: The model rank. + model_dir: The model_dir in the node. + kwargs: Any extra args. + + Returns: + None. The model handler should be kept in the class field. """ pass @@ -410,8 +405,11 @@ class DistributedPipeline(Pipeline): Use the model handler kept in the class field to forward. - @param inputs: The inputs after the preprocessing. - @return: The forward results. + Args: + inputs: The inputs after the preprocessing. + + Returns: + The forward results. 
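`ModelOutputBase` above gives every output dataclass hybrid access: fields left as `None` are dropped from `keys()`, and the remaining values can be read by attribute, by string key, or by integer/slice index. A minimal sketch of that contract (the `MyOutput` dataclass is made up for illustration, and the import assumes the class is re-exported from `modelscope.outputs`):

    from dataclasses import dataclass

    import torch

    from modelscope.outputs import ModelOutputBase  # assumed re-export of modelscope/outputs/outputs.py

    @dataclass
    class MyOutput(ModelOutputBase):
        logits: torch.Tensor = None
        loss: torch.Tensor = None

    out = MyOutput(logits=torch.randn(2, 3))
    assert out['logits'] is out.logits    # dict-style and attribute access return the same tensor
    assert out[0] is out.logits           # list-style access; None fields are skipped
    assert out.keys() == ['logits']       # 'loss' was never set, so it is not listed
    print(out.to_dict())                  # OrderedDict([('logits', tensor(...))])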
""" pass @@ -429,7 +427,7 @@ def collate_fn(data, device): """ from torch.utils.data.dataloader import default_collate - from modelscope.preprocessors import InputFeatures + from modelscope.preprocessors.nlp import InputFeatures if isinstance(data, dict) or isinstance(data, Mapping): return type(data)({k: collate_fn(v, device) for k, v in data.items()}) elif isinstance(data, (tuple, list)): diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index e1583387..498c9ed8 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -285,9 +285,6 @@ def pipeline(task: str = None, if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') - assert isinstance(model, (type(None), str, Model, list)), \ - f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' - model = normalize_model_input(model, model_revision) if pipeline_name is None: # get default pipeline for this task @@ -304,8 +301,7 @@ def pipeline(task: str = None, else: # used for test case, when model is str and is not hub path pipeline_name = get_pipeline_by_model_name(task, model) - elif isinstance(model, Model) or \ - (isinstance(model, list) and isinstance(model[0], Model)): + elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model if not hasattr(first_model, 'pipeline'): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 677151c0..73bd0d8c 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .automatic_post_editing_pipeline import AutomaticPostEditingPipeline from .conversational_text_to_sql_pipeline import ConversationalTextToSqlPipeline + from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .dialog_intent_prediction_pipeline import DialogIntentPredictionPipeline from .dialog_modeling_pipeline import DialogModelingPipeline from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline @@ -14,16 +15,13 @@ if TYPE_CHECKING: from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline - from .fill_mask_ponet_pipeline import FillMaskPonetPipeline from .information_extraction_pipeline import InformationExtractionPipeline from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline from .text_ranking_pipeline import TextRankingPipeline from .sentence_embedding_pipeline import SentenceEmbeddingPipeline - from .sequence_classification_pipeline import SequenceClassificationPipeline + from .text_classification_pipeline import TextClassificationPipeline from .summarization_pipeline import SummarizationPipeline - from .table_question_answering_pipeline import TableQuestionAnsweringPipeline from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline - from .text_classification_pipeline import TextClassificationPipeline from .text_error_correction_pipeline import TextErrorCorrectionPipeline from .text_generation_pipeline import TextGenerationPipeline from .text2text_generation_pipeline import Text2TextGenerationPipeline @@ -47,13 +45,11 @@ else: 'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'], 'feature_extraction_pipeline': 
['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], - 'fill_mask_ponet_pipeline': ['FillMaskPoNetPipeline'], 'information_extraction_pipeline': ['InformationExtractionPipeline'], 'named_entity_recognition_pipeline': ['NamedEntityRecognitionPipeline'], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], - 'sequence_classification_pipeline': ['SequenceClassificationPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], 'table_question_answering_pipeline': ['TableQuestionAnsweringPipeline'], diff --git a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py index 73c6429d..48df0c40 100644 --- a/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py +++ b/modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py @@ -11,8 +11,6 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import ConversationalTextToSqlPreprocessor -from modelscope.preprocessors.star.fields import (SubPreprocessor, - process_tables) from modelscope.utils.constant import Tasks __all__ = ['ConversationalTextToSqlPipeline'] @@ -39,17 +37,6 @@ class ConversationalTextToSqlPipeline(Pipeline): if preprocessor is None: preprocessor = ConversationalTextToSqlPreprocessor(model.model_dir) - preprocessor.device = 'cuda' if \ - ('device' not in kwargs or kwargs['device'] == 'gpu') \ - and torch.cuda.is_available() else 'cpu' - use_device = True if preprocessor.device == 'cuda' else False - preprocessor.processor = \ - SubPreprocessor(model_dir=model.model_dir, - db_content=True, - use_gpu=use_device) - preprocessor.output_tables = \ - process_tables(preprocessor.processor, - preprocessor.tables) super().__init__(model=model, preprocessor=preprocessor, **kwargs) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: diff --git a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py index 79d32ace..9520c06f 100644 --- a/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py @@ -4,7 +4,7 @@ from typing import Any, Dict, Union from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES @@ -20,7 +20,7 @@ __all__ = ['DialogStateTrackingPipeline'] class DialogStateTrackingPipeline(Pipeline): def __init__(self, - model: Union[SpaceForDialogStateTracking, str], + model: Union[SpaceForDST, str], preprocessor: DialogStateTrackingPreprocessor = None, **kwargs): """use `model` and `preprocessor` to create a dialog state tracking pipeline for @@ -33,8 +33,7 @@ class DialogStateTrackingPipeline(Pipeline): """ model = model if isinstance( - model, - SpaceForDialogStateTracking) else Model.from_pretrained(model) + model, SpaceForDST) else Model.from_pretrained(model) self.model = model if preprocessor is None: preprocessor = DialogStateTrackingPreprocessor(model.model_dir) diff --git a/modelscope/pipelines/nlp/distributed_plug_pipeline.py b/modelscope/pipelines/nlp/distributed_plug_pipeline.py index e5c05e86..8499f7ff 100644 --- 
a/modelscope/pipelines/nlp/distributed_plug_pipeline.py +++ b/modelscope/pipelines/nlp/distributed_plug_pipeline.py @@ -27,7 +27,8 @@ class DistributedPlugPipeline(DistributedPipeline): **kwargs): """Create a plug pipeline instance. - @param model: The model_id of plug(damo/nlp_plug_text-generation_27B). + Args: + model: The model_id of plug(damo/nlp_plug_text-generation_27B). The default path to damo/nlp_plug_text-generation_27B can be obtained by function get_cache_dir("damo/nlp_plug_text-generation_27B"), the model should be downloaded to this path before calling this class by model_id. @@ -52,11 +53,11 @@ class DistributedPlugPipeline(DistributedPipeline): |_ mp_rank_05_model_states.pt |_ mp_rank_06_model_states.pt |_ mp_rank_07_model_states.pt - @param preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will + preprocessor: The optional preprocessor, if not passed in, a TextGenerationPreprocessor will be used as default. - @param first_sequence: The first_sequence key name if the input format is a dict. - @param kwargs: - sequence_length: The input sequence_length. + first_sequence: The first_sequence key name if the input format is a dict. + kwargs: + sequence_length: The input sequence_length. """ if preprocessor is None: preprocessor = TextGenerationPreprocessor( diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 1d46d8fd..fd614e91 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -2,15 +2,12 @@ from typing import Any, Dict, Union -import torch - from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.nlp import SbertForFaqQuestionAnswering from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['FaqQuestionAnsweringPipeline'] @@ -21,19 +18,19 @@ __all__ = ['FaqQuestionAnsweringPipeline'] class FaqQuestionAnsweringPipeline(Pipeline): def __init__(self, - model: Union[str, SbertForFaqQuestionAnswering], - preprocessor: FaqQuestionAnsweringPreprocessor = None, + model: Union[str, Model], + preprocessor: Preprocessor = None, **kwargs): - model = model if isinstance( - model, - SbertForFaqQuestionAnswering) else Model.from_pretrained(model) - model.eval() + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = FaqQuestionAnsweringPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir, **kwargs) - self.preprocessor = preprocessor - super(FaqQuestionAnsweringPipeline, self).__init__( - model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + from modelscope.preprocessors import FaqQuestionAnsweringPreprocessor + preprocessor = FaqQuestionAnsweringPreprocessor( + model.model_dir, **kwargs) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters @@ -46,8 +43,7 @@ class FaqQuestionAnsweringPipeline(Pipeline): def forward(self, inputs: [list, Dict[str, Any]], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return 
self.model(inputs) + return self.model(inputs) def postprocess(self, inputs: [list, Dict[str, Any]], **postprocess_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 3d515e2d..0f3446e6 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,145 +1,103 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import NLPPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks __all__ = ['FillMaskPipeline'] -_type_map = { - 'veco': 'roberta', - 'sbert': 'bert', -} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) +@PIPELINES.register_module( + Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) class FillMaskPipeline(Pipeline): def __init__(self, model: Union[Model, str], preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', + first_sequence: str = 'sentence', **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction + """The inference pipeline for all the fill mask sub-tasks. Args: - model (str or Model): Supply either a local model dir which supported mlm task, or a - mlm model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' + model (`str` or `Model` or module instance): A model instance or a model local dir + or a model id in the model hub. + preprocessor (`Preprocessor`, `optional`): A Preprocessor instance. + first_sequence (`str`, `optional`): The key to read the sentence in. + sequence_length (`int`, `optional`): Max sequence length in the user's custom scenario, default 128. + + NOTE1: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' param will have no effect. - Example: + Example1: >>> from modelscope.pipelines import pipeline >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_structbert_fill-mask_english-large') >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' >>> print(pipeline_ins(input)) + Example2: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('fill-mask', model='damo/nlp_ponet_fill-mask_english-base') + >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' + >>> print(pipeline_ins(input)) NOTE2: Please pay attention to the model's special tokens. If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. To view other examples plese check the tests/pipelines/test_fill_mask.py. 
""" - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) + + fill_mask_model = Model.from_pretrained(model) if isinstance( + model, str) else model if preprocessor is None: - preprocessor = NLPPreprocessor( + preprocessor = Preprocessor.from_pretrained( fill_mask_model.model_dir, first_sequence=first_sequence, second_sequence=None, sequence_length=kwargs.pop('sequence_length', 128)) fill_mask_model.eval() + assert hasattr( + preprocessor, 'mask_id' + ), 'The input preprocessor should have the mask_id attribute.' super().__init__( model=fill_mask_model, preprocessor=preprocessor, **kwargs) - self.preprocessor = preprocessor - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103, 'deberta_v2': 4} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - }, - 'deberta_v2': { - '[PAD]': '', - r' +': ' ', - '[SEP]': '', - '[CLS]': '', - '[UNK]': '' - }, - } - def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: """process the prediction results Args: - inputs (Dict[str, Any]): _description_ - + inputs (Dict[str, Any]): The model outputs. + The output should follow some rules: + 1. Values can be retrieved by keys(dict-like, or the __getitem__ method is overriden) + 2. 'logits' and 'input_ids' key exists. + Models in modelscope will return the output dataclass `modelscope.outputs.FillMaskModelOutput`. Returns: Dict[str, str]: the prediction results """ - import numpy as np logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() pred_ids = np.argmax(logits, axis=-1) - if hasattr(self.model.config, 'backbone'): - model_type = self.model.config.backbone.type - else: - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, + rst_ids = np.where(input_ids == self.preprocessor.mask_id, pred_ids, input_ids) - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - pred_strings = [] for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) + pred_string = self.preprocessor.decode( + ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=True) pred_strings.append(pred_string) return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py b/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py deleted file mode 100644 index 9770fc38..00000000 --- a/modelscope/pipelines/nlp/fill_mask_ponet_pipeline.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import os -from typing import Any, Dict, Optional, Union - -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import FillMaskPoNetPreprocessor, Preprocessor -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks - -__all__ = ['FillMaskPonetPipeline'] -_type_map = {'ponet': 'bert'} - - -@PIPELINES.register_module( - Tasks.fill_mask, module_name=Pipelines.fill_mask_ponet) -class FillMaskPonetPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - first_sequence='sentence', - **kwargs): - """Use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction - - Args: - model (str or Model): Supply either a local model dir which supported fill-mask task, - or a fill-mask model id from the model hub, or a torch model instance. - preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for - the model if supplied. - first_sequence: The key to read the sentence in. - - NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' - param will have no effect. - - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_ins = pipeline( - 'fill-mask', model='damo/nlp_ponet_fill-mask_english-base') - >>> input = 'Everything in [MASK] you call reality is really [MASK] a reflection of your [MASK].' - >>> print(pipeline_ins(input)) - - NOTE2: Please pay attention to the model's special tokens. - If bert based model(bert, structbert, etc.) is used, the mask token is '[MASK]'. - If the xlm-roberta(xlm-roberta, veco, etc.) based model is used, the mask token is ''. - To view other examples plese check the tests/pipelines/test_fill_mask.py. 
- """ - fill_mask_model = model if isinstance( - model, Model) else Model.from_pretrained(model) - - self.config = Config.from_file( - os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) - - if preprocessor is None: - preprocessor = FillMaskPoNetPreprocessor( - fill_mask_model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 512)) - - fill_mask_model.eval() - super().__init__( - model=fill_mask_model, preprocessor=preprocessor, **kwargs) - - self.preprocessor = preprocessor - - self.tokenizer = preprocessor.tokenizer - self.mask_id = {'roberta': 250001, 'bert': 103} - - self.rep_map = { - 'bert': { - '[unused0]': '', - '[PAD]': '', - '[unused1]': '', - r' +': ' ', - '[SEP]': '', - '[unused2]': '', - '[CLS]': '', - '[UNK]': '' - }, - 'roberta': { - r' +': ' ', - '': '', - '': '', - '': '', - '': '', - '': ' ' - } - } - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - import numpy as np - logits = inputs[OutputKeys.LOGITS].detach().cpu().numpy() - input_ids = inputs[OutputKeys.INPUT_IDS].detach().cpu().numpy() - pred_ids = np.argmax(logits, axis=-1) - model_type = self.model.config.model_type - process_type = model_type if model_type in self.mask_id else _type_map[ - model_type] - rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, - input_ids) - - def rep_tokens(string, rep_map): - for k, v in rep_map.items(): - string = string.replace(k, v) - return string.strip() - - pred_strings = [] - for ids in rst_ids: # batch - if 'language' in self.config.model and self.config.model.language == 'zh': - pred_string = self.tokenizer.convert_ids_to_tokens(ids) - pred_string = ''.join(pred_string) - else: - pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[process_type]) - pred_strings.append(pred_string) - - return {OutputKeys.TEXT: pred_strings} diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 7275feca..8d8c4542 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['NamedEntityRecognitionPipeline'] @@ -59,37 +61,68 @@ class NamedEntityRecognitionPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) with torch.no_grad(): - return super().forward(inputs, **forward_params) + return { + **self.model(**inputs, **forward_params), OutputKeys.TEXT: text + } def postprocess(self, inputs: Dict[str, Any], **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): should be tensors from model + + Returns: + Dict[str, str]: the prediction results + """ text = inputs['text'] + if OutputKeys.PREDICTIONS not in inputs: + logits = 
inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + else: + predictions = inputs[OutputKeys.PREDICTIONS].squeeze( + 0).cpu().numpy() + predictions = torch_nested_numpify(torch_nested_detach(predictions)) offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] - labels = [self.id2label[x] for x in inputs['predicts']] - entities = [] - entity = {} + + labels = [self.id2label[x] for x in predictions] + chunks = [] + chunk = {} for label, offsets in zip(labels, offset_mapping): if label[0] in 'BS': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = { + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { 'type': label[2:], 'start': offsets[0], 'end': offsets[1] } if label[0] in 'IES': - if entity: - entity['end'] = offsets[1] + if chunk: + chunk['end'] = offsets[1] + if label[0] in 'ES': - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - entity = {} - if entity: - entity['span'] = text[entity['start']:entity['end']] - entities.append(entity) - outputs = {OutputKeys.OUTPUT: entities} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py index 16dedb2e..cfa5c2f1 100644 --- a/modelscope/pipelines/nlp/sentence_embedding_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_embedding_pipeline.py @@ -2,15 +2,14 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SentenceEmbeddingPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks __all__ = ['SentenceEmbeddingPipeline'] @@ -33,20 +32,18 @@ class SentenceEmbeddingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. 
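The new NER postprocess above assembles entities from BIOES-style labels and character offsets instead of concatenating tokens. A standalone sketch of that chunking loop on toy labels and offsets (not tied to any particular model):

    text = '小明在杭州'
    labels = ['B-PER', 'E-PER', 'O', 'B-LOC', 'E-LOC']          # toy BIOES tags, one per character
    offset_mapping = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]   # toy character offsets

    chunks, chunk = [], {}
    for label, offsets in zip(labels, offset_mapping):
        if label[0] in 'BS':
            if chunk:
                chunk['span'] = text[chunk['start']:chunk['end']]
                chunks.append(chunk)
            chunk = {'type': label[2:], 'start': offsets[0], 'end': offsets[1]}
        if label[0] in 'IES':
            if chunk:
                chunk['end'] = offsets[1]
        if label[0] in 'ES':
            if chunk:
                chunk['span'] = text[chunk['start']:chunk['end']]
                chunks.append(chunk)
                chunk = {}
    if chunk:
        chunk['span'] = text[chunk['start']:chunk['end']]
        chunks.append(chunk)
    print(chunks)  # [{'type': 'PER', ..., 'span': '小明'}, {'type': 'LOC', ..., 'span': '杭州'}]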
""" - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = SentenceEmbeddingPreprocessor( + preprocessor = Preprocessor.from_pretrained( model.model_dir if isinstance(model, Model) else model, first_sequence=first_sequence, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -57,6 +54,11 @@ class SentenceEmbeddingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - embs = inputs[OutputKeys.TEXT_EMBEDDING] - scores = inputs[OutputKeys.SCORES] + embs = inputs['last_hidden_state'][:, 0].cpu().numpy() + num_sent = embs.shape[0] + if num_sent >= 2: + scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ], + (1, 0))).tolist()[0] + else: + scores = [] return {OutputKeys.TEXT_EMBEDDING: embs, OutputKeys.SCORES: scores} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py deleted file mode 100644 index 69f6217a..00000000 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union - -import numpy as np -import torch - -from modelscope.metainfo import Pipelines -from modelscope.models.base import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - SequenceClassificationPreprocessor) -from modelscope.utils.constant import Tasks - - -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_analysis) -@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) -@PIPELINES.register_module( - Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) -@PIPELINES.register_module( - Tasks.text_classification, module_name=Pipelines.sentiment_classification) -class SequenceClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Preprocessor = None, - **kwargs): - """This is the base class for all the sequence classification sub-tasks. - - Args: - model (str or Model): A model instance or a model local dir or a model id in the model hub. - preprocessor (Preprocessor): a preprocessor instance, must not be None. 
- """ - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or Model' - model = model if isinstance(model, - Model) else Model.from_pretrained(model) - first_sequence = kwargs.pop('first_sequence', 'first_sequence') - second_sequence = kwargs.pop('second_sequence', None) - - if preprocessor is None: - preprocessor = SequenceClassificationPreprocessor( - model.model_dir if isinstance(model, Model) else model, - first_sequence=first_sequence, - second_sequence=second_sequence, - sequence_length=kwargs.pop('sequence_length', 512)) - - assert preprocessor is not None - model.eval() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = kwargs.get('id2label') - if self.id2label is None and hasattr(self.preprocessor, 'id2label'): - self.id2label = self.preprocessor.id2label - assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ - 'as a parameter or make sure the preprocessor has the attribute.' - - def forward(self, inputs: Dict[str, Any], - **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) - - def postprocess(self, - inputs: Dict[str, Any], - topk: int = 5) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - topk (int): The topk probs to take - Returns: - Dict[str, str]: the prediction results - """ - - probs = inputs[OutputKeys.PROBABILITIES][0] - num_classes = probs.shape[0] - topk = min(topk, num_classes) - top_indices = np.argpartition(probs, -topk)[-topk:] - cls_ids = top_indices[np.argsort(probs[top_indices])] - probs = probs[cls_ids].tolist() - - cls_names = [self.id2label[cid] for cid in cls_ids] - return {OutputKeys.SCORES: probs, OutputKeys.LABELS: cls_names} diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index fc0d07b1..826e35a9 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -13,9 +13,9 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.struct import (Constant, - SQLQuery) +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.struct import (Constant, + SQLQuery) from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 13d9964d..9e00ad7f 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -1,43 +1,124 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
 from typing import Any, Dict, Union
 
+import numpy as np
+
 from modelscope.metainfo import Pipelines
+from modelscope.models.base import Model
 from modelscope.models.multi_modal import OfaForAllTasks
-from modelscope.pipelines.base import Model, Pipeline
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Pipeline
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import OfaPreprocessor, Preprocessor
 from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-
-logger = get_logger()
 
 
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.sentiment_analysis)
+@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
+@PIPELINES.register_module(
+    Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
 @PIPELINES.register_module(
     Tasks.text_classification, module_name=Pipelines.text_classification)
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.sentiment_classification)
+@PIPELINES.register_module(
+    Tasks.text_classification, module_name=Pipelines.sentence_similarity)
+@PIPELINES.register_module(
+    Tasks.sentiment_classification,
+    module_name=Pipelines.sentiment_classification)
 class TextClassificationPipeline(Pipeline):
 
     def __init__(self,
                  model: Union[Model, str],
-                 preprocessor: [Preprocessor] = None,
+                 preprocessor: Preprocessor = None,
                  **kwargs):
+        """The inference pipeline for all the text classification sub-tasks.
+
+        Args:
+            model (`str` or `Model` or module instance): A model instance or a model local dir
+                or a model id in the model hub.
+            preprocessor (`Preprocessor`, `optional`): A Preprocessor instance.
+            first_sequence (`str`, `optional`): The key of the first sentence.
+            second_sequence (`str`, `optional`): The key of the second sentence.
+            sequence_length (`int`, `optional`): The sequence length.
+            id2label (`dict`, `optional`): The id-label mapping.
+
+        Example:
+            >>> from modelscope.pipelines import pipeline
+            >>> pipeline_ins = pipeline('text-classification',
+                    model='damo/nlp_structbert_sentence-similarity_chinese-base')
+            >>> input = ('这是个测试', '这也是个测试')
+            >>> print(pipeline_ins(input))
+
+        NOTE: Inputs of type 'str' are also supported. In this scenario, the 'first_sequence' and 'second_sequence'
+            param will have no effect.
""" - use `model` and `preprocessor` to create a kws pipeline for prediction + model = Model.from_pretrained(model) if isinstance(model, + str) else model + + if preprocessor is None: + if isinstance(model, OfaForAllTasks): + preprocessor = OfaPreprocessor(model_dir=model.model_dir) + else: + first_sequence = kwargs.pop('first_sequence', 'first_sequence') + second_sequence = kwargs.pop('second_sequence', None) + preprocessor = Preprocessor.from_pretrained( + model if isinstance(model, str) else model.model_dir, + first_sequence=first_sequence, + second_sequence=second_sequence, + sequence_length=kwargs.pop('sequence_length', 512)) + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.id2label = kwargs.get('id2label') + if self.id2label is None and hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + if isinstance(self.model, OfaForAllTasks): + return super().forward(inputs, **forward_params) + return self.model(**inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + Args: - model: model id on modelscope hub. + inputs (`Dict[str, Any]` or `TextClassificationModelOutput`): The model output, please check + the `TextClassificationModelOutput` class for details. + topk (int): The topk probs to take + Returns: + Dict[str, str]: the prediction results. + scores: The probabilities of each label. + labels: The real labels. + Label at index 0 is the smallest probability. """ - super().__init__(model=model) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - if isinstance(model, str): - pipe_model = Model.from_pretrained(model) - elif isinstance(model, Model): - pipe_model = model + if isinstance(self.model, OfaForAllTasks): + return inputs else: - raise NotImplementedError - pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) - super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ + 'as a parameter or make sure the preprocessor has the attribute.' 
+ logits = inputs[OutputKeys.LOGITS].cpu().numpy() + if logits.shape[0] == 1: + logits = logits[0] + + def softmax(logits): + exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True)) + return exp / exp.sum(axis=-1, keepdims=True) + + probs = softmax(logits) + num_classes = probs.shape[-1] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() + + def map_to_label(id): + return self.id2label[id] + + v_func = np.vectorize(map_to_label) + return { + OutputKeys.SCORES: probs, + OutputKeys.LABELS: v_func(top_indices).tolist() + } diff --git a/modelscope/pipelines/nlp/text_ranking_pipeline.py b/modelscope/pipelines/nlp/text_ranking_pipeline.py index 4aa57238..9cee327b 100644 --- a/modelscope/pipelines/nlp/text_ranking_pipeline.py +++ b/modelscope/pipelines/nlp/text_ranking_pipeline.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, Union -import torch +import numpy as np from modelscope.metainfo import Pipelines from modelscope.models import Model @@ -32,20 +32,18 @@ class TextRankingPipeline(Pipeline): the model if supplied. sequence_length: Max sequence length in the user's custom scenario. 128 will be used as a default value. """ - model = model if isinstance(model, - Model) else Model.from_pretrained(model) + model = Model.from_pretrained(model) if isinstance(model, + str) else model if preprocessor is None: - preprocessor = TextRankingPreprocessor( - model.model_dir if isinstance(model, Model) else model, + preprocessor = Preprocessor.from_pretrained( + model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) - model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return {**self.model(inputs, **forward_params)} + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """process the prediction results @@ -55,6 +53,10 @@ class TextRankingPipeline(Pipeline): Returns: Dict[str, Any]: the predicted text representation """ - pred_list = inputs[OutputKeys.SCORES] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = inputs[OutputKeys.LOGITS].squeeze(-1).detach().cpu().numpy() + pred_list = sigmoid(logits).tolist() return {OutputKeys.SCORES: pred_list} diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 055a4b8a..c36f0dfc 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -7,17 +7,22 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, - TokenClassificationPreprocessor) +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['TokenClassificationPipeline'] @PIPELINES.register_module( Tasks.token_classification, module_name=Pipelines.part_of_speech) +@PIPELINES.register_module( + Tasks.token_classification, module_name=Pipelines.word_segmentation) +@PIPELINES.register_module( + 
    Tasks.token_classification, module_name=Pipelines.named_entity_recognition)
 @PIPELINES.register_module(
     Tasks.part_of_speech, module_name=Pipelines.part_of_speech)
 class TokenClassificationPipeline(Pipeline):
@@ -32,24 +37,18 @@ class TokenClassificationPipeline(Pipeline):
             model (str or Model): A model instance or a model local dir or a model id in the model hub.
             preprocessor (Preprocessor): a preprocessor instance, must not be None.
         """
-        assert isinstance(model, str) or isinstance(model, Model), \
-            'model must be a single str or Model'
-        model = model if isinstance(model,
-                                    Model) else Model.from_pretrained(model)
+        model = Model.from_pretrained(model) if isinstance(model,
+                                                           str) else model
+
         if preprocessor is None:
-            preprocessor = TokenClassificationPreprocessor(
+            preprocessor = Preprocessor.from_pretrained(
                 model.model_dir,
                 sequence_length=kwargs.pop('sequence_length', 128))
         model.eval()
         super().__init__(model=model, preprocessor=preprocessor, **kwargs)
-        if hasattr(model, 'id2label'):
-            self.id2label = getattr(model, 'id2label')
-        else:
-            model_config = getattr(model, 'config')
-            self.id2label = getattr(model_config, 'id2label')
-
-        assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \
-            'as a parameter or make sure the preprocessor has the attribute.'
+        self.id2label = kwargs.get('id2label')
+        if self.id2label is None and hasattr(self.preprocessor, 'id2label'):
+            self.id2label = self.preprocessor.id2label
 
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
@@ -64,38 +63,59 @@ class TokenClassificationPipeline(Pipeline):
         """process the prediction results
 
         Args:
-            inputs (Dict[str, Any]): _description_
+            inputs (Dict[str, Any]): should be tensors from model
 
         Returns:
             Dict[str, str]: the prediction results
         """
+        text = inputs['text']
+        if not hasattr(inputs, 'predictions'):
+            logits = inputs[OutputKeys.LOGITS]
+            predictions = torch.argmax(logits[0], dim=-1)
+        else:
+            predictions = inputs[OutputKeys.PREDICTIONS].squeeze(
+                0).cpu().numpy()
+        predictions = torch_nested_numpify(torch_nested_detach(predictions))
+        offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']]
-        pred_list = inputs['predictions']
-        labels = []
-        for pre in pred_list:
-            labels.append(self.id2label[pre])
-        labels = labels[1:-1]
+        labels = [self.id2label[x] for x in predictions]
+        if len(labels) > len(offset_mapping):
+            labels = labels[1:-1]
         chunks = []
-        tags = []
-        chunk = ''
-        assert len(inputs['text']) == len(labels)
-        for token, label in zip(inputs['text'], labels):
-            if label[0] == 'B' or label[0] == 'I':
-                chunk += token
-            else:
-                chunk += token
-                chunks.append(chunk)
-                chunk = ''
-                tags.append(label.split('-')[-1])
+        chunk = {}
+        for label, offsets in zip(labels, offset_mapping):
+            if label[0] in 'BS':
+                if chunk:
+                    chunk['span'] = text[chunk['start']:chunk['end']]
+                    chunks.append(chunk)
+                chunk = {
+                    'type': label[2:],
+                    'start': offsets[0],
+                    'end': offsets[1]
+                }
+            if label[0] in 'IES':
+                if chunk:
+                    chunk['end'] = offsets[1]
+
+            if label[0] in 'ES':
+                if chunk:
+                    chunk['span'] = text[chunk['start']:chunk['end']]
+                    chunks.append(chunk)
+                    chunk = {}
+        if chunk:
+            chunk['span'] = text[chunk['start']:chunk['end']]
             chunks.append(chunk)
-            tags.append(label.split('-')[-1])
-        pos_result = []
-        seg_result = ' '.join(chunks)
-        for chunk, tag in zip(chunks, tags):
-            pos_result.append({OutputKeys.WORD: chunk, OutputKeys.LABEL: tag})
-        outputs = {
-            OutputKeys.OUTPUT: seg_result,
-            OutputKeys.LABELS: pos_result
-        }
+
+        # for cws
output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outputs + else: + outputs = {OutputKeys.OUTPUT: chunks} return outputs diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index eb7f7f74..68a03631 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -34,7 +34,8 @@ class TranslationPipeline(Pipeline): def __init__(self, model: Model, **kwargs): """Build a translation pipeline with a model dir or a model id in the model hub. - @param model: A Model instance. + Args: + model: A Model instance. """ super().__init__(model=model, **kwargs) model = self.model.model_dir diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 9d4bb67f..0df8f1ad 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -12,6 +12,8 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import (Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) __all__ = ['WordSegmentationPipeline'] @@ -72,28 +74,56 @@ class WordSegmentationPipeline(Pipeline): """process the prediction results Args: - inputs (Dict[str, Any]): _description_ + inputs (Dict[str, Any]): should be tensors from model Returns: Dict[str, str]: the prediction results """ - - pred_list = inputs['predictions'] - labels = [] - for pre in pred_list: - labels.append(self.id2label[pre]) - labels = labels[1:-1] + text = inputs['text'] + logits = inputs[OutputKeys.LOGITS] + predictions = torch.argmax(logits[0], dim=-1) + logits = torch_nested_numpify(torch_nested_detach(logits)) + predictions = torch_nested_numpify(torch_nested_detach(predictions)) + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] + + labels = [self.id2label[x] for x in predictions] + if len(labels) > len(offset_mapping): + labels = labels[1:-1] chunks = [] - chunk = '' - assert len(inputs['text']) == len(labels) - for token, label in zip(inputs['text'], labels): - if label[0] == 'B' or label[0] == 'I': - chunk += token - else: - chunk += token - chunks.append(chunk) - chunk = '' + chunk = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if chunk: + chunk['end'] = offsets[1] + + if label[0] in 'ES': + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] + chunks.append(chunk) + chunk = {} + if chunk: + chunk['span'] = text[chunk['start']:chunk['end']] chunks.append(chunk) - seg_result = ' '.join(chunks) - return {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for cws output + if len(chunks) > 0 and chunks[0]['type'] == 'cws': + spans = [ + chunk['span'] for chunk in chunks if chunk['span'].strip() + ] + seg_result = ' '.join(spans) + outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []} + + # for ner outpus + else: + outputs = {OutputKeys.OUTPUT: chunks} + return outputs diff --git 
a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index fc7051c7..88792b45 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -86,8 +86,7 @@ class ZeroShotClassificationPipeline(Pipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: - with torch.no_grad(): - return self.model(**inputs, **forward_params) + return self.model(**inputs, **forward_params) def postprocess(self, inputs: Dict[str, Any], @@ -99,7 +98,7 @@ class ZeroShotClassificationPipeline(Pipeline): Returns: Dict[str, Any]: the prediction results """ - logits = inputs[OutputKeys.LOGITS] + logits = inputs[OutputKeys.LOGITS].cpu().numpy() if multi_label or len(candidate_labels) == 1: logits = logits[..., [self.contradiction_id, self.entailment_id]] scores = softmax(logits, axis=-1)[..., 1] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 423b3f46..76c6d877 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -16,31 +16,20 @@ if TYPE_CHECKING: from .kws import WavToLists from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextErrorCorrectionPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, + DocumentSegmentationPreprocessor, FaqQuestionAnsweringPreprocessor, + FillMaskPoNetPreprocessor, NLPPreprocessor, + NLPTokenizerPreprocessorBase, TextRankingPreprocessor, + RelationExtractionPreprocessor, SentenceEmbeddingPreprocessor, + SequenceClassificationPreprocessor, TokenClassificationPreprocessor, + TextErrorCorrectionPreprocessor, TextGenerationPreprocessor, + Text2TextGenerationPreprocessor, Tokenize, WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - from .space import (DialogIntentPredictionPreprocessor, - DialogModelingPreprocessor, - DialogStateTrackingPreprocessor) + ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, + SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, DialogStateTrackingPreprocessor, + ConversationalTextToSqlPreprocessor, + TableQuestionAnsweringPreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor - from .star import ConversationalTextToSqlPreprocessor - from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { @@ -58,30 +47,22 @@ else: 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', - 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', + 'FaqQuestionAnsweringPreprocessor', 'FillMaskPoNetPreprocessor', + 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', + 'TextRankingPreprocessor', 'RelationExtractionPreprocessor', 'SentenceEmbeddingPreprocessor', 'SequenceClassificationPreprocessor', 'TokenClassificationPreprocessor', - 
'TextErrorCorrectionPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', + 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', + 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', - ], - 'space': [ + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', - 'DialogStateTrackingPreprocessor', 'InputFeatures' + 'DialogStateTrackingPreprocessor', + 'ConversationalTextToSqlPreprocessor', + 'TableQuestionAnsweringPreprocessor' ], - 'star': ['ConversationalTextToSqlPreprocessor'], - 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 6360a907..c2716a13 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -1,15 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os from abc import ABC, abstractmethod -from typing import Any, Dict +from copy import deepcopy +from typing import Any, Dict, Optional, Sequence -from modelscope.utils.constant import ModeKeys +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys, Tasks +from modelscope.utils.hub import read_config, snapshot_download +from modelscope.utils.logger import get_logger +from .builder import build_preprocessor + +logger = get_logger(__name__) class Preprocessor(ABC): - def __init__(self, *args, **kwargs): - self._mode = ModeKeys.INFERENCE + def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): + self._mode = mode self.device = int( os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else None pass @@ -25,3 +32,61 @@ class Preprocessor(ABC): @mode.setter def mode(self, value): self._mode = value + + @classmethod + def from_pretrained(cls, + model_name_or_path: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cfg_dict: Config = None, + preprocessor_mode=ModeKeys.INFERENCE, + **kwargs): + """ Instantiate a model from local directory or remote model repo. Note + that when loading from remote, the model revision can be specified. 
+ """ + if not os.path.exists(model_name_or_path): + model_dir = snapshot_download( + model_name_or_path, revision=revision) + else: + model_dir = model_name_or_path + if cfg_dict is None: + cfg = read_config(model_dir) + else: + cfg = cfg_dict + task = cfg.task + if 'task' in kwargs: + task = kwargs.pop('task') + field_name = Tasks.find_field_by_task(task) + if not hasattr(cfg, 'preprocessor'): + logger.error('No preprocessor field found in cfg.') + return None + + sub_key = 'train' if preprocessor_mode == ModeKeys.TRAIN else 'val' + + if 'type' not in cfg.preprocessor: + if sub_key in cfg.preprocessor: + sub_cfg = getattr(cfg.preprocessor, sub_key) + else: + logger.error( + f'No {sub_key} key and type key found in ' + f'preprocessor domain of configuration.json file.') + return None + else: + sub_cfg = cfg.preprocessor + + if len(sub_cfg): + if isinstance(sub_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + sub_cfg = deepcopy(sub_cfg) + sub_cfg.update({'model_dir': model_dir}) + sub_cfg.update(kwargs) + preprocessor = build_preprocessor(sub_cfg, field_name) + else: + logger.error( + f'Cannot find available config to build preprocessor at mode {preprocessor_mode}, ' + f'please check the preprocessor field in the configuration.json file.' + ) + return None + preprocessor.mode = preprocessor_mode + return preprocessor diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index b95048ba..ea7b6bf4 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -5,50 +5,68 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor - from .nlp_base import ( - DocumentSegmentationPreprocessor, - FaqQuestionAnsweringPreprocessor, - FillMaskPoNetPreprocessor, - NLPPreprocessor, - NLPTokenizerPreprocessorBase, - TextRankingPreprocessor, - RelationExtractionPreprocessor, - SentenceEmbeddingPreprocessor, - SequenceClassificationPreprocessor, - TokenClassificationPreprocessor, - TextGenerationPreprocessor, - Text2TextGenerationPreprocessor, - Tokenize, - WordSegmentationBlankSetToLabelPreprocessor, - ZeroShotClassificationPreprocessor, - TextGenerationJiebaPreprocessor, - SentencePiecePreprocessor, - ) - + from .nlp_base import (NLPTokenizerPreprocessorBase, NLPBasePreprocessor) + from .text_generation_jieba_preprocessor import TextGenerationJiebaPreprocessor + from .sentence_piece_preprocessor import SentencePiecePreprocessor + from .bert_seq_cls_tokenizer import Tokenize + from .document_segmentation_preprocessor import DocumentSegmentationPreprocessor + from .faq_question_answering_preprocessor import FaqQuestionAnsweringPreprocessor + from .fill_mask_preprocessor import FillMaskPoNetPreprocessor, NLPPreprocessor + from .text_ranking_preprocessor import TextRankingPreprocessor + from .relation_extraction_preprocessor import RelationExtractionPreprocessor + from .sentence_classification_preprocessor import SequenceClassificationPreprocessor + from .sentence_embedding_preprocessor import SentenceEmbeddingPreprocessor + from .text_generation_preprocessor import TextGenerationPreprocessor + from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor + from .token_classification_preprocessor import TokenClassificationPreprocessor, \ + WordSegmentationBlankSetToLabelPreprocessor + from 
.zero_shot_classification_reprocessor import ZeroShotClassificationPreprocessor + from .space import (DialogIntentPredictionPreprocessor, + DialogModelingPreprocessor, + DialogStateTrackingPreprocessor, InputFeatures, + MultiWOZBPETextField, IntentBPETextField) + from .space_T_en import ConversationalTextToSqlPreprocessor + from .space_T_cn import TableQuestionAnsweringPreprocessor else: _import_structure = { 'nlp_base': [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'FillMaskPoNetPreprocessor', - 'NLPPreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', + 'NLPBasePreprocessor', + ], + 'text_generation_jieba_preprocessor': + ['TextGenerationJiebaPreprocessor'], + 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], + 'bert_seq_cls_tokenizer': ['Tokenize'], + 'document_segmentation_preprocessor': + ['DocumentSegmentationPreprocessor'], + 'faq_question_answering_preprocessor': + ['FaqQuestionAnsweringPreprocessor'], + 'fill_mask_preprocessor': + ['FillMaskPoNetPreprocessor', 'NLPPreprocessor'], + 'text_ranking_preprocessor': ['TextRankingPreprocessor'], + 'relation_extraction_preprocessor': ['RelationExtractionPreprocessor'], + 'sentence_classification_preprocessor': + ['SequenceClassificationPreprocessor'], + 'sentence_embedding_preprocessor': ['SentenceEmbeddingPreprocessor'], + 'text_generation_preprocessor': ['TextGenerationPreprocessor'], + 'text2text_generation_preprocessor': + ['Text2TextGenerationPreprocessor'], + 'token_classification_preprocessor': [ 'TokenClassificationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'Text2TextGenerationPreprocessor', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', - 'TextGenerationJiebaPreprocessor', - 'SentencePiecePreprocessor', + 'WordSegmentationBlankSetToLabelPreprocessor' ], + 'zero_shot_classification_reprocessor': + ['ZeroShotClassificationPreprocessor'], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'space': [ + 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', + 'DialogStateTrackingPreprocessor', 'InputFeatures', + 'MultiWOZBPETextField', 'IntentBPETextField' + ], + 'space_T_en': ['ConversationalTextToSqlPreprocessor'], + 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], } import sys diff --git a/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py new file mode 100644 index 00000000..576687ce --- /dev/null +++ b/modelscope/preprocessors/nlp/bert_seq_cls_tokenizer.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
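As a usage note for the `Preprocessor.from_pretrained` classmethod added to modelscope/preprocessors/base.py above: a minimal sketch of the intended call pattern, assuming a placeholder model id; only the keyword names come from the signature introduced above, the values are illustrative.

from modelscope.preprocessors import Preprocessor
from modelscope.utils.constant import ModeKeys

# The id below is a placeholder; any local model dir whose configuration.json
# contains a `preprocessor` field should work the same way.
preprocessor = Preprocessor.from_pretrained(
    'damo/placeholder_model_id',
    preprocessor_mode=ModeKeys.INFERENCE,  # picks the 'val' sub-config when no 'type' key is present
    sequence_length=128)  # extra kwargs are merged into the preprocessor sub-config

# from_pretrained returns None when no usable preprocessor config is found.
if preprocessor is not None:
    features = preprocessor('This is a test sentence.')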
+ +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, InputFields + + +@PREPROCESSORS.register_module(Fields.nlp) +class Tokenize(Preprocessor): + + def __init__(self, tokenizer_name) -> None: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + + def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(data, str): + data = {InputFields.text: data} + token_dict = self.tokenizer(data[InputFields.text]) + data.update(token_dict) + return data diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py new file mode 100644 index 00000000..5ab0a0c6 --- /dev/null +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -0,0 +1,220 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.logger import get_logger +from .nlp_base import NLPBasePreprocessor + +logger = get_logger() + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.document_segmentation) +class DocumentSegmentationPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, config, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + from transformers import BertTokenizerFast + self.tokenizer = BertTokenizerFast.from_pretrained( + model_dir, + use_fast=True, + ) + self.question_column_name = 'labels' + self.context_column_name = 'sentences' + self.example_id_column_name = 'example_id' + self.label_to_id = {'B-EOP': 0, 'O': 1} + self.target_specical_ids = set() + self.target_specical_ids.add(self.tokenizer.eos_token_id) + self.max_seq_length = config.max_position_embeddings + self.label_list = ['B-EOP', 'O'] + + def __call__(self, examples) -> Dict[str, Any]: + questions = examples[self.question_column_name] + contexts = examples[self.context_column_name] + example_ids = examples[self.example_id_column_name] + num_examples = len(questions) + + sentences = [] + for sentence_list in contexts: + sentence_list = [_ + '[EOS]' for _ in sentence_list] + sentences.append(sentence_list) + + try: + tokenized_examples = self.tokenizer( + sentences, + is_split_into_words=True, + add_special_tokens=False, + return_token_type_ids=True, + return_attention_mask=True, + ) + except Exception as e: + logger.error(e) + return {} + + segment_ids = [] + token_seq_labels = [] + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_labels = questions[example_index] + example_labels = [ + self.label_to_id[_] if _ in self.label_to_id else -100 + for _ in example_labels + ] + example_token_labels = [] + segment_id = [] + cur_seg_id = 1 + for token_index in range(len(example_input_ids)): + if example_input_ids[token_index] in self.target_specical_ids: + example_token_labels.append(example_labels[cur_seg_id - 1]) + segment_id.append(cur_seg_id) + cur_seg_id += 1 + else: + example_token_labels.append(-100) + segment_id.append(cur_seg_id) + + segment_ids.append(segment_id) + token_seq_labels.append(example_token_labels) + + 
tokenized_examples['segment_ids'] = segment_ids + tokenized_examples['token_seq_labels'] = token_seq_labels + + new_segment_ids = [] + new_token_seq_labels = [] + new_input_ids = [] + new_token_type_ids = [] + new_attention_mask = [] + new_example_ids = [] + new_sentences = [] + + for example_index in range(num_examples): + example_input_ids = tokenized_examples['input_ids'][example_index] + example_token_type_ids = tokenized_examples['token_type_ids'][ + example_index] + example_attention_mask = tokenized_examples['attention_mask'][ + example_index] + example_segment_ids = tokenized_examples['segment_ids'][ + example_index] + example_token_seq_labels = tokenized_examples['token_seq_labels'][ + example_index] + example_sentences = contexts[example_index] + example_id = example_ids[example_index] + example_total_num_sentences = len(questions[example_index]) + example_total_num_tokens = len( + tokenized_examples['input_ids'][example_index]) + accumulate_length = [ + i for i, x in enumerate(tokenized_examples['input_ids'] + [example_index]) + if x == self.tokenizer.eos_token_id + ] + samples_boundary = [] + left_index = 0 + sent_left_index = 0 + sent_i = 0 + + # for sent_i, length in enumerate(accumulate_length): + while sent_i < len(accumulate_length): + length = accumulate_length[sent_i] + right_index = length + 1 + sent_right_index = sent_i + 1 + if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: + samples_boundary.append([left_index, right_index]) + + sample_input_ids = [ + self.tokenizer.cls_token_id + ] + example_input_ids[left_index:right_index] + sample_input_ids = sample_input_ids[:self.max_seq_length] + + sample_token_type_ids = [ + 0 + ] + example_token_type_ids[left_index:right_index] + sample_token_type_ids = sample_token_type_ids[:self. + max_seq_length] + + sample_attention_mask = [ + 1 + ] + example_attention_mask[left_index:right_index] + sample_attention_mask = sample_attention_mask[:self. + max_seq_length] + + sample_segment_ids = [ + 0 + ] + example_segment_ids[left_index:right_index] + sample_segment_ids = sample_segment_ids[:self. + max_seq_length] + + sample_token_seq_labels = [ + -100 + ] + example_token_seq_labels[left_index:right_index] + sample_token_seq_labels = sample_token_seq_labels[:self. 
+ max_seq_length] + + if sent_right_index - 1 == sent_left_index: + left_index = right_index + sample_input_ids[-1] = self.tokenizer.eos_token_id + sample_token_seq_labels[-1] = -100 + else: + left_index = accumulate_length[sent_i - 1] + 1 + if sample_token_seq_labels[-1] != -100: + sample_token_seq_labels[-1] = -100 + + if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index] + sent_left_index = sent_right_index + sent_i += 1 + else: + sample_sentences = example_sentences[ + sent_left_index:sent_right_index - 1] + sent_left_index = sent_right_index - 1 + + if (len([_ for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences) - 1 and (len([ + _ + for _ in sample_token_seq_labels if _ != -100 + ])) != len(sample_sentences): + tmp = [] + for w_i, w, l in zip( + sample_input_ids, + self.tokenizer.decode(sample_input_ids).split( + ' '), sample_token_seq_labels): + tmp.append((w_i, w, l)) + while len(sample_input_ids) < self.max_seq_length: + sample_input_ids.append(self.tokenizer.pad_token_id) + sample_token_type_ids.append(0) + sample_attention_mask.append(0) + sample_segment_ids.append(example_total_num_sentences + + 1) + sample_token_seq_labels.append(-100) + + new_input_ids.append(sample_input_ids) + new_token_type_ids.append(sample_token_type_ids) + new_attention_mask.append(sample_attention_mask) + new_segment_ids.append(sample_segment_ids) + new_token_seq_labels.append(sample_token_seq_labels) + new_example_ids.append(example_id) + new_sentences.append(sample_sentences) + else: + sent_i += 1 + continue + + output_samples = {} + + output_samples['input_ids'] = new_input_ids + output_samples['token_type_ids'] = new_token_type_ids + output_samples['attention_mask'] = new_attention_mask + + output_samples['segment_ids'] = new_segment_ids + output_samples['example_id'] = new_example_ids + output_samples['labels'] = new_token_seq_labels + output_samples['sentences'] = new_sentences + + return output_samples diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py new file mode 100644 index 00000000..72c8ed99 --- /dev/null +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -0,0 +1,90 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
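To make the input contract of the DocumentSegmentationPreprocessor above concrete, a hedged sketch with a single two-sentence document; the texts, labels and the model directory are invented placeholders, and `config` only needs to expose `max_position_embeddings`.

from types import SimpleNamespace

from modelscope.preprocessors.nlp import DocumentSegmentationPreprocessor

# Placeholders: the directory must hold a BERT-style tokenizer configured with
# an '[EOS]' end-of-sentence token, which the preprocessor appends to every sentence.
model_dir = '/path/to/document_segmentation_model'
config = SimpleNamespace(max_position_embeddings=512)  # caps each generated window

preprocessor = DocumentSegmentationPreprocessor(model_dir, config)

batch = {
    'example_id': ['doc-0'],
    'sentences': [[
        'This is the first sentence of the document.',
        'This is the second sentence of the document.'
    ]],
    # One label per sentence; 'B-EOP' marks the end of a paragraph, 'O' does not.
    'labels': [['O', 'B-EOP']],
}

features = preprocessor(batch)
# features contains padded input_ids, token_type_ids, attention_mask,
# segment_ids, labels, example_id and sentences, split into windows of at most
# max_position_embeddings tokens.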
+ +import os +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) +class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + super(FaqQuestionAnsweringPreprocessor, self).__init__( + model_dir, mode=ModeKeys.INFERENCE, **kwargs) + from transformers import BertTokenizer + self.tokenizer = BertTokenizer.from_pretrained(model_dir) + preprocessor_config = Config.from_file( + os.path.join(model_dir, ModelFile.CONFIGURATION)).get( + ConfigFields.preprocessor, {}) + self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) + self.label_dict = None + + def pad(self, samples, max_len): + result = [] + for sample in samples: + pad_len = max_len - len(sample[:max_len]) + result.append(sample[:max_len] + + [self.tokenizer.pad_token_id] * pad_len) + return result + + def set_label_dict(self, label_dict): + self.label_dict = label_dict + + def get_label(self, label_id): + assert self.label_dict is not None and label_id < len(self.label_dict) + return self.label_dict[label_id] + + def encode_plus(self, text): + return [ + self.tokenizer.cls_token_id + ] + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] + + @type_assert(object, Dict) + def __call__(self, data: Dict[str, Any], + **preprocessor_param) -> Dict[str, Any]: + TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) + queryset = data['query_set'] + if not isinstance(queryset, list): + queryset = [queryset] + supportset = data['support_set'] + supportset = sorted(supportset, key=lambda d: d['label']) + + queryset_tokenized = [self.encode_plus(text) for text in queryset] + supportset_tokenized = [ + self.encode_plus(item['text']) for item in supportset + ] + + max_len = max( + [len(seq) for seq in queryset_tokenized + supportset_tokenized]) + max_len = min(TMP_MAX_LEN, max_len) + queryset_padded = self.pad(queryset_tokenized, max_len) + supportset_padded = self.pad(supportset_tokenized, max_len) + + supportset_labels_ori = [item['label'] for item in supportset] + label_dict = [] + for label in supportset_labels_ori: + if label not in label_dict: + label_dict.append(label) + self.set_label_dict(label_dict) + supportset_labels_ids = [ + label_dict.index(label) for label in supportset_labels_ori + ] + return { + 'query': queryset_padded, + 'support': supportset_padded, + 'support_labels': supportset_labels_ids + } + + def batch_encode(self, sentence_list: list, max_length=None): + if not max_length: + max_length = self.MAX_LEN + return self.tokenizer.batch_encode_plus( + sentence_list, padding=True, max_length=max_length) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py new file mode 100644 index 00000000..b0638dbc --- /dev/null +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -0,0 +1,142 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
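Likewise, a small sketch of the few-shot input layout expected by the FaqQuestionAnsweringPreprocessor above; the queries, support texts, labels and the model directory are placeholders.

from modelscope.preprocessors.nlp import FaqQuestionAnsweringPreprocessor

# Placeholder path: the directory must contain a BERT tokenizer and a
# configuration.json (used to read max_seq_length).
model_dir = '/path/to/faq_model'
preprocessor = FaqQuestionAnsweringPreprocessor(model_dir)

data = {
    # Sentences to classify against the support set.
    'query_set': ['How do I reset my password?'],
    # Labelled examples; they are sorted and indexed by 'label' internally.
    'support_set': [
        {'text': 'I forgot my password', 'label': 'account'},
        {'text': 'Where is my parcel', 'label': 'logistics'},
    ],
}

features = preprocessor(data)
# features['query'] and features['support'] are padded token-id sequences;
# features['support_labels'] holds label indices that can be mapped back to
# label strings with preprocessor.get_label(...).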
+ +import os.path as osp +import re +from typing import Any, Dict, Tuple, Union + +import numpy as np +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.config import Config +from modelscope.utils.constant import Fields, ModeKeys, ModelFile +from modelscope.utils.nlp import import_external_nltk_data +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.feature_extraction) +class NLPPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.fill_mask_ponet) +class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in PoNet model's MLM task. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 512) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + True) + super().__init__(model_dir, mode=mode, **kwargs) + + self.cfg = Config.from_file( + osp.join(model_dir, ModelFile.CONFIGURATION)) + self.language = self.cfg.model.get('language', 'en') + if self.language == 'en': + from nltk.tokenize import sent_tokenize + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') + elif self.language in ['zh', 'cn']: + + def sent_tokenize(para): + para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * + para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', + para) # noqa * + para = para.rstrip() + return [_ for _ in para.split('\n') if _] + else: + raise NotImplementedError + + self.sent_tokenize = sent_tokenize + self.max_length = kwargs['max_length'] + + def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + + text_a, text_b, labels = self.parse_text_and_label(data) + output = self.tokenizer( + text_a, + text_b, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + max_seq_length = self.max_length + + if text_b is None: + segment_ids = [] + seg_lens = list( + map( + len, + self.tokenizer( + self.sent_tokenize(text_a), + add_special_tokens=False, + truncation=True)['input_ids'])) + segment_id = [0] + sum( + [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) + segment_id = segment_id[:max_seq_length - 1] + segment_ids.append(segment_id + [segment_id[-1] + 1] + * (max_seq_length - len(segment_id))) + if self.mode == ModeKeys.INFERENCE: + segment_ids = torch.tensor(segment_ids) + output['segment_ids'] = segment_ids + + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } + + self.labels_to_id(labels, output) + return output + + @property + def mask_id(self): + return self.tokenizer.mask_token_id + + def decode(self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + return self.tokenizer.decode(token_ids, skip_special_tokens, + clean_up_tokenization_spaces, **kwargs) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6075a4b3..48a04d7a 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -1,67 +1,41 @@ # Copyright (c) Alibaba, Inc. and its affiliates. + import os -import os.path as osp -import re -from typing import Any, Dict, Optional, Tuple, Union +from abc import ABC +from collections.abc import Mapping +from typing import Any, Dict, List, Tuple, Union import json import numpy as np -import sentencepiece as spm import torch from transformers import AutoTokenizer -from modelscope.metainfo import Models, Preprocessors +from modelscope.metainfo import Models from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields -from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.constant import ModeKeys from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger -from modelscope.utils.nlp import import_external_nltk_data -from modelscope.utils.type_assert import type_assert logger = get_logger() __all__ = [ - 'DocumentSegmentationPreprocessor', - 'FaqQuestionAnsweringPreprocessor', - 'NLPPreprocessor', - 'FillMaskPoNetPreprocessor', + 'NLPBasePreprocessor', 'NLPTokenizerPreprocessorBase', - 'TextRankingPreprocessor', - 'RelationExtractionPreprocessor', - 'SentenceEmbeddingPreprocessor', - 'SequenceClassificationPreprocessor', - 'TokenClassificationPreprocessor', - 'Text2TextGenerationPreprocessor', - 'TextGenerationPreprocessor', - 'Tokenize', - 'WordSegmentationBlankSetToLabelPreprocessor', - 'ZeroShotClassificationPreprocessor', ] -@PREPROCESSORS.register_module(Fields.nlp) -class Tokenize(Preprocessor): - - def __init__(self, tokenizer_name) -> None: - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - - def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]: - if isinstance(data, str): - data = {InputFields.text: data} - token_dict = self.tokenizer(data[InputFields.text]) - data.update(token_dict) - return data - - 
-class NLPTokenizerPreprocessorBase(Preprocessor): - - def __init__(self, model_dir: str, mode: str, **kwargs): - """The NLP tokenizer preprocessor base class. +class NLPBasePreprocessor(Preprocessor, ABC): - Any nlp preprocessor which uses the hf tokenizer can inherit from this class. + def __init__(self, + model_dir: str, + first_sequence=None, + second_sequence=None, + label=None, + label2id=None, + mode=ModeKeys.INFERENCE, + **kwargs): + """The NLP preprocessor base class. Args: model_dir (str): The local model path @@ -71,18 +45,12 @@ class NLPTokenizerPreprocessorBase(Preprocessor): label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping if this mapping is not supplied. mode: Run this preprocessor in either 'train'/'eval'/'inference' mode - kwargs: These kwargs will be directly fed into the tokenizer. """ + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.label = label - super().__init__(**kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self._mode = mode - self.label = kwargs.pop('label', OutputKeys.LABEL) self.use_fast = kwargs.pop('use_fast', None) if self.use_fast is None and os.path.isfile( os.path.join(model_dir, 'tokenizer_config.json')): @@ -92,15 +60,82 @@ class NLPTokenizerPreprocessorBase(Preprocessor): self.use_fast = json_config.get('use_fast') self.use_fast = False if self.use_fast is None else self.use_fast - self.label2id = None - if 'label2id' in kwargs: - self.label2id = kwargs.pop('label2id') + self.label2id = label2id if self.label2id is None: self.label2id = parse_label_mapping(self.model_dir) + super().__init__(mode, **kwargs) - self.tokenize_kwargs = kwargs + @property + def mask_id(self): + """Child preprocessor can override this property to return the id of mask token. + Returns: + The id of mask token, default None. + """ + return None + + def decode(self, + token_ids: Union[int, List[int], 'np.ndarray', 'torch.Tensor', + 'tf.Tensor'], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs): + """Turn the token_ids to real sentence. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + The real sentence decoded by the preprocessor. + """ + raise NotImplementedError() + + +class NLPTokenizerPreprocessorBase(NLPBasePreprocessor): + + def __init__(self, + model_dir: str, + first_sequence: str = None, + second_sequence: str = None, + label: str = 'label', + label2id: dict = None, + mode: str = ModeKeys.INFERENCE, + **kwargs): + """The NLP tokenizer preprocessor base class. + + Any nlp preprocessor which uses the hf tokenizer can inherit from this class. 
+ + Args: + model_dir (str): The local model path + first_sequence: The key for the first sequence + second_sequence: The key for the second sequence + label: The key for the label + label2id: An optional label2id dict. + If label2id is None, the preprocessor will try to parse label-id mapping from: + - configuration.json model.label2id/model.id2label + - config.json label2id/id2label + - label_mapping.json + mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different. + kwargs: These kwargs will be directly fed into the tokenizer. + """ + + super().__init__(model_dir, first_sequence, second_sequence, label, + label2id, mode) + self.model_dir = model_dir + self.tokenize_kwargs = kwargs self.tokenizer = self.build_tokenizer(model_dir) + logger.info(f'The key of sentence1: {self.first_sequence}, ' + f'The key of sentence2: {self.second_sequence}, ' + f'The key of label: {self.label}') + if self.first_sequence is None: + logger.warning('[Important] first_sequence attribute is not set, ' + 'this will cause an error if your input is a dict.') @property def id2label(self): @@ -118,8 +153,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): NOTE: This default implementation only returns slow tokenizer, because the fast tokenizers have a multi-thread problem. - @param model_dir: The local model dir. - @return: The initialized tokenizer. + Args: + model_dir: The local model dir. + + Returns: + The initialized tokenizer. """ self.is_transformer_based_model = 'lstm' not in model_dir # fast version lead to parallel inference failed @@ -180,8 +218,11 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the pair param is False, data will be parsed as the first_sentence and the label, else it will be parsed as the first_sentence and the second_sentence. - @param data: The input data. - @return: The sentences and labels tuple. + Args: + data: The input data. + + Returns: + The sentences and labels tuple. """ text_a, text_b, labels = None, None, None if isinstance(data, str): @@ -194,7 +235,7 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_a, text_b = data else: text_a, labels = data - elif isinstance(data, dict): + elif isinstance(data, Mapping): text_a = data.get(self.first_sequence) text_b = data.get(self.second_sequence) labels = data.get(self.label) @@ -208,1007 +249,34 @@ class NLPTokenizerPreprocessorBase(Preprocessor): If the original label's type is float, or the label2id mapping does not exist, the original label will be returned. - @param labels: The input labels. - @param output: The label id. - @return: The final labels. + Args: + labels: The input labels. + output: The label id. + + Returns: + The final labels. 
""" def label_can_be_mapped(label): return isinstance(label, str) or isinstance(label, int) - if labels is not None: + try: if isinstance(labels, (tuple, list)) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: output[OutputKeys.LABELS] = [ - self.label2id[str(label)] for label in labels + self.label2id[label] + if label in self.label2id else self.label2id[str(label)] + for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABELS] = self.label2id[str(labels)] - else: + output[OutputKeys.LABELS] = self.label2id[ + labels] if labels in self.label2id else self.label2id[str( + labels)] + elif labels is not None: output[OutputKeys.LABELS] = labels - - -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.fill_mask) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.feature_extraction) -class NLPPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_ranking) -class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text-ranking model. - """ - - def __init__(self, - model_dir: str, - mode=ModeKeys.INFERENCE, - *args, - **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - super().__init__(model_dir, pair=True, mode=mode, *args, **kwargs) - self.model_dir: str = model_dir - self.first_sequence: str = kwargs.pop('first_sequence', - 'source_sentence') - self.second_sequence = kwargs.pop('second_sequence', - 'sentences_to_compare') - self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: - if isinstance(data, tuple): - sentence1, sentence2 = data - elif isinstance(data, dict): - sentence1 = data.get(self.first_sequence) - sentence2 = data.get(self.second_sequence) - if isinstance(sentence2, str): - sentence2 = [sentence2] - if isinstance(sentence1, str): - sentence1 = [sentence1] - sentence1 = sentence1 * len(sentence2) - - max_seq_length = self.sequence_length - feature = self.tokenizer( - sentence1, - sentence2, - padding='max_length', - truncation=True, - max_length=max_seq_length, - return_tensors='pt') - if 'labels' in data: - labels = data['labels'] - feature['labels'] = labels - if 'qid' in data: - qid = data['qid'] - feature['qid'] = qid - return feature - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.nli_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) -class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sequence classification. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_embedding) -class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in sentence embedding. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data Dict: - keys: "source_sentence" && "sentences_to_compare" - values: list of sentences - Example: - {"source_sentence": ["how long it take to get a master's degree"], - "sentences_to_compare": ["On average, students take about 18 to 24 months - to complete a master's degree.", - "On the other hand, some students prefer to go at a slower pace - and choose to take several years to complete their studies.", - "It can take anywhere from two semesters"]} - Returns: - Dict[str, Any]: the preprocessed data - """ - source_sentence = data['source_sentence'] - compare_sentences = data['sentences_to_compare'] - sentences = [] - sentences.append(source_sentence[0]) - for sent in compare_sentences: - sentences.append(sent) - - tokenized_inputs = self.tokenizer( - sentences, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - padding=True, - truncation=True) - return tokenized_inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) -class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in zero shot classification. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - self.sequence_length = kwargs.pop('sequence_length', 512) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[str, Dict], hypothesis_template: str, - candidate_labels: list) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str or dict): a sentence - Example: - 'you are so handsome.' - - Returns: - Dict[str, Any]: the preprocessed data - """ - if isinstance(data, dict): - data = data.get(self.first_sequence) - - pairs = [[data, hypothesis_template.format(label)] - for label in candidate_labels] - - features = self.tokenizer( - pairs, - padding=True, - truncation=True, - max_length=self.sequence_length, - truncation_strategy='only_first', - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None) - return features - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) -class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') - kwargs['padding'] = kwargs.get('padding', False) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - text_a, _, _ = self.parse_text_and_label(data) - - inputs = self.tokenizer( - text_a, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - - # This is produced by tokenizers but is an invalid generate kwargs - if 'token_type_ids' in inputs: - del inputs['token_type_ids'] - return inputs - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) -class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in text generation. - """ - - def __init__(self, - model_dir: str, - tokenizer=None, - mode=ModeKeys.INFERENCE, - **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - False) - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, mode=mode, **kwargs) - - @staticmethod - def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: - import os - for name in os.listdir(model_dir): - full_name = os.path.join(model_dir, name) - if 'roberta' in name and os.path.isdir(full_name): - return full_name - - def build_tokenizer(self, model_dir: str): - roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) - if roberta_tokenizer_dir: - from transformers import RobertaTokenizer - return RobertaTokenizer.from_pretrained( - roberta_tokenizer_dir, do_lower_case=False) - return super().build_tokenizer(model_dir) - - def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: - if self._mode == ModeKeys.INFERENCE: - return super().__call__(data) - src_rst = super().__call__(data['src_txt']) - src_input_ids = src_rst['input_ids'] - src_attention_mask = src_rst['attention_mask'] - if 'tgt_txt' in data: - labels = super().__call__(data['tgt_txt'])['input_ids'] - else: - labels = src_input_ids[1:] - src_input_ids = src_input_ids[:-1] - src_attention_mask = src_attention_mask[:-1] - - return { - 'input_ids': src_input_ids, - 'attention_mask': src_attention_mask, - 'labels': labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) -class TextGenerationJiebaPreprocessor(Preprocessor): - """The jieba tokenizer preprocessor used in text generation. 
- """ - - def __init__(self, model_dir: str, *args, **kwargs): - from modelscope.models.nlp.gpt3 import JiebaBPETokenizer - super().__init__(*args, **kwargs) - self.tokenizer = JiebaBPETokenizer( - osp.join(model_dir, 'tokenizer.json')) - - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' - Returns: - Dict[str, Any]: the preprocessed data - Example: - {'net_input': - {'src_tokens':tensor([1,2,3,4]), - 'src_lengths': tensor([4])} - } - """ - import torch - - return { - 'input_ids': - torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) - } - - -@PREPROCESSORS.register_module( - Fields.nlp, - module_name=Preprocessors.word_segment_text_to_label_preprocessor) -class WordSegmentationBlankSetToLabelPreprocessor(Preprocessor): - """The preprocessor used to turn a single sentence to a labeled token-classification dict. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.first_sequence: str = kwargs.pop('first_sequence', - 'first_sequence') - self.label = kwargs.pop('label', OutputKeys.LABELS) - - def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: - data = data.split(' ') - data = list(filter(lambda x: len(x) > 0, data)) - - def produce_train_sample(words): - chars = [] - labels = [] - for word in words: - chars.extend(list(word)) - if len(word) == 1: - labels.append('S-CWS') - else: - labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) - + ['E-CWS']) - assert len(chars) == len(labels) - return chars, labels - - chars, labels = produce_train_sample(data) - return { - self.first_sequence: chars, - self.label: labels, - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.ner_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) -class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in normal NER task. - """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get( - 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 128) - self.label_all_tokens = kwargs.pop('label_all_tokens', False) - super().__init__(model_dir, mode=mode, **kwargs) - - if 'is_split_into_words' in kwargs: - self.is_split_into_words = kwargs.pop('is_split_into_words') - else: - self.is_split_into_words = self.tokenizer.init_kwargs.get( - 'is_split_into_words', False) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = None - labels_list = None - if isinstance(data, str): - text = data - elif isinstance(data, dict): - text = data.get(self.first_sequence) - labels_list = data.get(self.label) - - input_ids = [] - label_mask = [] - offset_mapping = [] - if self.is_split_into_words: - for offset, token in enumerate(list(data)): - subtoken_ids = self.tokenizer.encode( - token, add_special_tokens=False) - if len(subtoken_ids) == 0: - subtoken_ids = [self.tokenizer.unk_token_id] - input_ids.extend(subtoken_ids) - label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) - offset_mapping.extend([(offset, offset + 1)]) - else: - if self.tokenizer.is_fast: - encodings = self.tokenizer( - text, - add_special_tokens=False, - return_offsets_mapping=True, - **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - word_ids = encodings.word_ids() - for i in range(len(word_ids)): - if word_ids[i] is None: - label_mask.append(0) - elif word_ids[i] == word_ids[i - 1]: - label_mask.append(0) - offset_mapping[-1] = ( - offset_mapping[-1][0], - encodings['offset_mapping'][i][1]) - else: - label_mask.append(1) - offset_mapping.append(encodings['offset_mapping'][i]) - else: - encodings = self.tokenizer( - text, add_special_tokens=False, **self.tokenize_kwargs) - input_ids = encodings['input_ids'] - label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( - text) - - if len(input_ids) >= self.sequence_length - 2: - input_ids = input_ids[:self.sequence_length - 2] - label_mask = label_mask[:self.sequence_length - 2] - input_ids = [self.tokenizer.cls_token_id - ] + input_ids + [self.tokenizer.sep_token_id] - label_mask = [0] + label_mask + [0] - attention_mask = [1] * len(input_ids) - offset_mapping = offset_mapping[:sum(label_mask)] - - if not self.is_transformer_based_model: - input_ids = input_ids[1:-1] - attention_mask = attention_mask[1:-1] - label_mask = label_mask[1:-1] - - if self._mode == ModeKeys.INFERENCE: - input_ids = torch.tensor(input_ids).unsqueeze(0) - attention_mask = torch.tensor(attention_mask).unsqueeze(0) - label_mask = torch.tensor( - label_mask, dtype=torch.bool).unsqueeze(0) - - # the token classification - output = { - 'text': text, - 'input_ids': input_ids, - 'attention_mask': attention_mask, - 'label_mask': label_mask, - 'offset_mapping': offset_mapping - } - - # align the labels with tokenized text - if labels_list is not None: - assert self.label2id is not None - # Map that sends B-Xxx label to its I-Xxx counterpart - b_to_i_label = [] - label_enumerate_values = [ - k for k, v in sorted( - self.label2id.items(), key=lambda item: item[1]) - ] - for idx, label in enumerate(label_enumerate_values): - if label.startswith('B-') and label.replace( - 'B-', 'I-') in label_enumerate_values: - b_to_i_label.append( - label_enumerate_values.index( - label.replace('B-', 'I-'))) - else: - b_to_i_label.append(idx) - - label_row = [self.label2id[lb] for lb in labels_list] - previous_word_idx = None - label_ids = [] - for word_idx in word_ids: - if word_idx is None: - label_ids.append(-100) - elif word_idx != previous_word_idx: - label_ids.append(label_row[word_idx]) - else: - if self.label_all_tokens: - label_ids.append(b_to_i_label[label_row[word_idx]]) - else: - label_ids.append(-100) - previous_word_idx = word_idx - labels = label_ids - output['labels'] = labels - return output - - def get_tokenizer_class(self): - tokenizer_class = self.tokenizer.__class__.__name__ - if 
tokenizer_class.endswith( - 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': - tokenizer_class = tokenizer_class[:-4] - return tokenizer_class - - def get_label_mask_and_offset_mapping(self, text): - label_mask = [] - offset_mapping = [] - tokens = self.tokenizer.tokenize(text) - offset = 0 - if self.get_tokenizer_class() == 'BertTokenizer': - for token in tokens: - is_start = (token[:2] != '##') - if is_start: - label_mask.append(True) - else: - token = token[2:] - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': - last_is_blank = False - for token in tokens: - is_start = (token[0] == '▁') - if is_start: - token = token[1:] - label_mask.append(True) - if len(token) == 0: - last_is_blank = True - continue - else: - label_mask.append(False) - start = offset + text[offset:].index(token) - end = start + len(token) - if last_is_blank or is_start: - offset_mapping.append((start, end)) - else: - offset_mapping[-1] = (offset_mapping[-1][0], end) - offset = end - last_is_blank = False - else: - raise NotImplementedError - - return label_mask, offset_mapping - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.re_tokenizer) -class RelationExtractionPreprocessor(Preprocessor): - """The relation extraction preprocessor used in normal RE task. - """ - - def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - - self.model_dir: str = model_dir - self.sequence_length = kwargs.pop('sequence_length', 512) - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=True) - - @type_assert(object, str) - def __call__(self, data: str) -> Dict[str, Any]: - """process the raw input data - - Args: - data (str): a sentence - Example: - 'you are so handsome.' 
- - Returns: - Dict[str, Any]: the preprocessed data - """ - - # preprocess the data for the model input - text = data - output = self.tokenizer([text], return_tensors='pt') - return { - 'text': text, - 'input_ids': output['input_ids'], - 'attention_mask': output['attention_mask'], - 'offsets': output[0].offsets - } - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.faq_question_answering_preprocessor) -class FaqQuestionAnsweringPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - super(FaqQuestionAnsweringPreprocessor, self).__init__( - model_dir, mode=ModeKeys.INFERENCE, **kwargs) - import os - from transformers import BertTokenizer - - from modelscope.utils.config import Config - from modelscope.utils.constant import ModelFile - self.tokenizer = BertTokenizer.from_pretrained(model_dir) - preprocessor_config = Config.from_file( - os.path.join(model_dir, ModelFile.CONFIGURATION)).get( - ConfigFields.preprocessor, {}) - self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) - self.label_dict = None - - def pad(self, samples, max_len): - result = [] - for sample in samples: - pad_len = max_len - len(sample[:max_len]) - result.append(sample[:max_len] - + [self.tokenizer.pad_token_id] * pad_len) - return result - - def set_label_dict(self, label_dict): - self.label_dict = label_dict - - def get_label(self, label_id): - assert self.label_dict is not None and label_id < len(self.label_dict) - return self.label_dict[label_id] - - def encode_plus(self, text): - return [ - self.tokenizer.cls_token_id - ] + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text)) + [self.tokenizer.sep_token_id] - - @type_assert(object, Dict) - def __call__(self, data: Dict[str, Any], - **preprocessor_param) -> Dict[str, Any]: - TMP_MAX_LEN = preprocessor_param.get('max_seq_length', self.MAX_LEN) - queryset = data['query_set'] - if not isinstance(queryset, list): - queryset = [queryset] - supportset = data['support_set'] - supportset = sorted(supportset, key=lambda d: d['label']) - - queryset_tokenized = [self.encode_plus(text) for text in queryset] - supportset_tokenized = [ - self.encode_plus(item['text']) for item in supportset - ] - - max_len = max( - [len(seq) for seq in queryset_tokenized + supportset_tokenized]) - max_len = min(TMP_MAX_LEN, max_len) - queryset_padded = self.pad(queryset_tokenized, max_len) - supportset_padded = self.pad(supportset_tokenized, max_len) - - supportset_labels_ori = [item['label'] for item in supportset] - label_dict = [] - for label in supportset_labels_ori: - if label not in label_dict: - label_dict.append(label) - self.set_label_dict(label_dict) - supportset_labels_ids = [ - label_dict.index(label) for label in supportset_labels_ori - ] - return { - 'query': queryset_padded, - 'support': supportset_padded, - 'support_labels': supportset_labels_ids - } - - def batch_encode(self, sentence_list: list, max_length=None): - if not max_length: - max_length = self.MAX_LEN - return self.tokenizer.batch_encode_plus( - sentence_list, padding=True, max_length=max_length) - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.document_segmentation) -class DocumentSegmentationPreprocessor(Preprocessor): - - def __init__(self, model_dir: str, config, *args, **kwargs): - """preprocess the data - - Args: - model_dir (str): model path - """ - - super().__init__(*args, **kwargs) - from transformers import BertTokenizerFast - self.tokenizer = BertTokenizerFast.from_pretrained( - model_dir, - 
use_fast=True, - ) - self.question_column_name = 'labels' - self.context_column_name = 'sentences' - self.example_id_column_name = 'example_id' - self.label_to_id = {'B-EOP': 0, 'O': 1} - self.target_specical_ids = set() - self.target_specical_ids.add(self.tokenizer.eos_token_id) - self.max_seq_length = config.max_position_embeddings - self.label_list = ['B-EOP', 'O'] - - def __call__(self, examples) -> Dict[str, Any]: - questions = examples[self.question_column_name] - contexts = examples[self.context_column_name] - example_ids = examples[self.example_id_column_name] - num_examples = len(questions) - - sentences = [] - for sentence_list in contexts: - sentence_list = [_ + '[EOS]' for _ in sentence_list] - sentences.append(sentence_list) - - try: - tokenized_examples = self.tokenizer( - sentences, - is_split_into_words=True, - add_special_tokens=False, - return_token_type_ids=True, - return_attention_mask=True, - ) - except Exception as e: - logger.error(e) - return {} - - segment_ids = [] - token_seq_labels = [] - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_labels = questions[example_index] - example_labels = [ - self.label_to_id[_] if _ in self.label_to_id else -100 - for _ in example_labels - ] - example_token_labels = [] - segment_id = [] - cur_seg_id = 1 - for token_index in range(len(example_input_ids)): - if example_input_ids[token_index] in self.target_specical_ids: - example_token_labels.append(example_labels[cur_seg_id - 1]) - segment_id.append(cur_seg_id) - cur_seg_id += 1 - else: - example_token_labels.append(-100) - segment_id.append(cur_seg_id) - - segment_ids.append(segment_id) - token_seq_labels.append(example_token_labels) - - tokenized_examples['segment_ids'] = segment_ids - tokenized_examples['token_seq_labels'] = token_seq_labels - - new_segment_ids = [] - new_token_seq_labels = [] - new_input_ids = [] - new_token_type_ids = [] - new_attention_mask = [] - new_example_ids = [] - new_sentences = [] - - for example_index in range(num_examples): - example_input_ids = tokenized_examples['input_ids'][example_index] - example_token_type_ids = tokenized_examples['token_type_ids'][ - example_index] - example_attention_mask = tokenized_examples['attention_mask'][ - example_index] - example_segment_ids = tokenized_examples['segment_ids'][ - example_index] - example_token_seq_labels = tokenized_examples['token_seq_labels'][ - example_index] - example_sentences = contexts[example_index] - example_id = example_ids[example_index] - example_total_num_sentences = len(questions[example_index]) - example_total_num_tokens = len( - tokenized_examples['input_ids'][example_index]) - accumulate_length = [ - i for i, x in enumerate(tokenized_examples['input_ids'] - [example_index]) - if x == self.tokenizer.eos_token_id - ] - samples_boundary = [] - left_index = 0 - sent_left_index = 0 - sent_i = 0 - - # for sent_i, length in enumerate(accumulate_length): - while sent_i < len(accumulate_length): - length = accumulate_length[sent_i] - right_index = length + 1 - sent_right_index = sent_i + 1 - if right_index - left_index >= self.max_seq_length - 1 or right_index == example_total_num_tokens: - samples_boundary.append([left_index, right_index]) - - sample_input_ids = [ - self.tokenizer.cls_token_id - ] + example_input_ids[left_index:right_index] - sample_input_ids = sample_input_ids[:self.max_seq_length] - - sample_token_type_ids = [ - 0 - ] + example_token_type_ids[left_index:right_index] - sample_token_type_ids = 
sample_token_type_ids[:self. - max_seq_length] - - sample_attention_mask = [ - 1 - ] + example_attention_mask[left_index:right_index] - sample_attention_mask = sample_attention_mask[:self. - max_seq_length] - - sample_segment_ids = [ - 0 - ] + example_segment_ids[left_index:right_index] - sample_segment_ids = sample_segment_ids[:self. - max_seq_length] - - sample_token_seq_labels = [ - -100 - ] + example_token_seq_labels[left_index:right_index] - sample_token_seq_labels = sample_token_seq_labels[:self. - max_seq_length] - - if sent_right_index - 1 == sent_left_index: - left_index = right_index - sample_input_ids[-1] = self.tokenizer.eos_token_id - sample_token_seq_labels[-1] = -100 - else: - left_index = accumulate_length[sent_i - 1] + 1 - if sample_token_seq_labels[-1] != -100: - sample_token_seq_labels[-1] = -100 - - if sent_right_index - 1 == sent_left_index or right_index == example_total_num_tokens: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index] - sent_left_index = sent_right_index - sent_i += 1 - else: - sample_sentences = example_sentences[ - sent_left_index:sent_right_index - 1] - sent_left_index = sent_right_index - 1 - - if (len([_ for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences) - 1 and (len([ - _ - for _ in sample_token_seq_labels if _ != -100 - ])) != len(sample_sentences): - tmp = [] - for w_i, w, l in zip( - sample_input_ids, - self.tokenizer.decode(sample_input_ids).split( - ' '), sample_token_seq_labels): - tmp.append((w_i, w, l)) - while len(sample_input_ids) < self.max_seq_length: - sample_input_ids.append(self.tokenizer.pad_token_id) - sample_token_type_ids.append(0) - sample_attention_mask.append(0) - sample_segment_ids.append(example_total_num_sentences - + 1) - sample_token_seq_labels.append(-100) - - new_input_ids.append(sample_input_ids) - new_token_type_ids.append(sample_token_type_ids) - new_attention_mask.append(sample_attention_mask) - new_segment_ids.append(sample_segment_ids) - new_token_seq_labels.append(sample_token_seq_labels) - new_example_ids.append(example_id) - new_sentences.append(sample_sentences) - else: - sent_i += 1 - continue - - output_samples = {} - - output_samples['input_ids'] = new_input_ids - output_samples['token_type_ids'] = new_token_type_ids - output_samples['attention_mask'] = new_attention_mask - - output_samples['segment_ids'] = new_segment_ids - output_samples['example_id'] = new_example_ids - output_samples['labels'] = new_token_seq_labels - output_samples['sentences'] = new_sentences - - return output_samples - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.fill_mask_ponet) -class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): - """The tokenizer preprocessor used in MLM task. 
- """ - - def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - kwargs['truncation'] = kwargs.get('truncation', True) - kwargs['padding'] = kwargs.get('padding', 'max_length') - kwargs['max_length'] = kwargs.pop('sequence_length', 512) - kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', - True) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) - - self.cfg = Config.from_file( - osp.join(model_dir, ModelFile.CONFIGURATION)) - self.language = self.cfg.model.get('language', 'en') - if self.language == 'en': - from nltk.tokenize import sent_tokenize - import_external_nltk_data( - osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') - elif self.language in ['zh', 'cn']: - - def sent_tokenize(para): - para = re.sub(r'([。!!?\?])([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para) # noqa * - para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', - para) # noqa * - para = para.rstrip() - return [_ for _ in para.split('\n') if _] - else: - raise NotImplementedError - - self.sent_tokenize = sent_tokenize - self.max_length = kwargs['max_length'] - - def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]: - """process the raw input data - - Args: - data (tuple): [sentence1, sentence2] - sentence1 (str): a sentence - Example: - 'you are so handsome.' - sentence2 (str): a sentence - Example: - 'you are so beautiful.' - Returns: - Dict[str, Any]: the preprocessed data - """ - - text_a, text_b, labels = self.parse_text_and_label(data) - output = self.tokenizer( - text_a, - text_b, - return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, - **self.tokenize_kwargs) - max_seq_length = self.max_length - - if text_b is None: - segment_ids = [] - seg_lens = list( - map( - len, - self.tokenizer( - self.sent_tokenize(text_a), - add_special_tokens=False, - truncation=True)['input_ids'])) - segment_id = [0] + sum( - [[i] * sl for i, sl in enumerate(seg_lens, start=1)], []) - segment_id = segment_id[:max_seq_length - 1] - segment_ids.append(segment_id + [segment_id[-1] + 1] - * (max_seq_length - len(segment_id))) - output['segment_ids'] = segment_ids - - output = { - k: np.array(v) if isinstance(v, list) else v - for k, v in output.items() - } - - self.labels_to_id(labels, output) - return output - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_piece) -class SentencePiecePreprocessor(Preprocessor): - - def __init__(self, model_dir: str, *args, **kwargs): - import os - - super().__init__(*args, **kwargs) - self.tokenizer = None - for file_name in os.listdir(model_dir): - if file_name.endswith('.model'): - m_file = osp.join(model_dir, file_name) - self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) - break - assert self.tokenizer is not None, 'Can not find .model file' - - def __call__(self, data: str) -> Dict[str, Any]: - return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) + except KeyError as e: + logger.error( + f'Label {labels} cannot be found in the label mapping {self.label2id},' + f'which comes from the user input or the configuration files. 
' + f'Please consider matching your labels with this mapping.') + raise e diff --git a/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py new file mode 100644 index 00000000..9a426ab7 --- /dev/null +++ b/modelscope/preprocessors/nlp/relation_extraction_preprocessor.py @@ -0,0 +1,55 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.re_tokenizer) +class RelationExtractionPreprocessor(NLPBasePreprocessor): + """The relation extraction preprocessor used in normal RE task. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + + super().__init__(model_dir, *args, **kwargs) + + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=True) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = data + output = self.tokenizer([text], return_tensors='pt') + return { + 'text': text, + 'input_ids': output['input_ids'], + 'attention_mask': output['attention_mask'], + 'offsets': output[0].offsets + } diff --git a/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py new file mode 100644 index 00000000..f1295c50 --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_classification_preprocessor.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_sim_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SequenceClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sequence classification. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py new file mode 100644 index 00000000..519de60c --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_embedding) +class SentenceEmbeddingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in sentence embedding. + """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[str, Dict]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict: + keys: "source_sentence" && "sentences_to_compare" + values: list of sentences + Example: + {"source_sentence": ["how long it take to get a master's degree"], + "sentences_to_compare": ["On average, students take about 18 to 24 months + to complete a master's degree.", + "On the other hand, some students prefer to go at a slower pace + and choose to take several years to complete their studies.", + "It can take anywhere from two semesters"]} + Returns: + Dict[str, Any]: the preprocessed data + """ + source_sentence = data['source_sentence'] + compare_sentences = data['sentences_to_compare'] + sentences = [] + sentences.append(source_sentence[0]) + for sent in compare_sentences: + sentences.append(sent) + + tokenized_inputs = self.tokenizer( + sentences, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + padding=True, + truncation=True) + return tokenized_inputs diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py new file mode 100644 index 00000000..1d1ef19d --- /dev/null +++ b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py @@ -0,0 +1,32 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os.path as osp +from typing import Any, Dict + +import sentencepiece as spm +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/modelscope/preprocessors/space/__init__.py b/modelscope/preprocessors/nlp/space/__init__.py similarity index 100% rename from modelscope/preprocessors/space/__init__.py rename to modelscope/preprocessors/nlp/space/__init__.py diff --git a/modelscope/preprocessors/space/args.py b/modelscope/preprocessors/nlp/space/args.py similarity index 97% rename from modelscope/preprocessors/space/args.py rename to modelscope/preprocessors/nlp/space/args.py index d9e91e74..17c6828b 100644 --- a/modelscope/preprocessors/space/args.py +++ b/modelscope/preprocessors/nlp/space/args.py @@ -1,7 +1,4 @@ -""" -Parse argument. -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse import json diff --git a/modelscope/preprocessors/space/batch.py b/modelscope/preprocessors/nlp/space/batch.py similarity index 96% rename from modelscope/preprocessors/space/batch.py rename to modelscope/preprocessors/nlp/space/batch.py index fe0ad0ec..d27776f5 100644 --- a/modelscope/preprocessors/space/batch.py +++ b/modelscope/preprocessors/nlp/space/batch.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def batch(reader, batch_size, drop_last=False): """ This operator creates a batched reader which combines the data from the diff --git a/modelscope/preprocessors/space/data_loader.py b/modelscope/preprocessors/nlp/space/data_loader.py similarity index 87% rename from modelscope/preprocessors/space/data_loader.py rename to modelscope/preprocessors/nlp/space/data_loader.py index bd04a79c..290b64f3 100644 --- a/modelscope/preprocessors/space/data_loader.py +++ b/modelscope/preprocessors/nlp/space/data_loader.py @@ -1,18 +1,16 @@ -""" -DataLoader class -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
import math import os import numpy as np -from modelscope.preprocessors.space.args import str2bool -from modelscope.preprocessors.space.batch import batch -from modelscope.preprocessors.space.lazy_dataset import LazyDataset -from modelscope.preprocessors.space.sampler import (RandomSampler, - SequentialSampler, - SortedSampler) +from modelscope.preprocessors.nlp.space.args import str2bool +from modelscope.preprocessors.nlp.space.batch import batch +from modelscope.preprocessors.nlp.space.lazy_dataset import LazyDataset +from modelscope.preprocessors.nlp.space.sampler import (RandomSampler, + SequentialSampler, + SortedSampler) def get_data_loader(batch_size, reader, hparams, file, collate_fn, is_test): diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py similarity index 64% rename from modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py index e2602eaa..2923157e 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_intent_prediction_preprocessor.py @@ -8,8 +8,7 @@ import json from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.intent_field import \ - IntentBPETextField +from modelscope.preprocessors.nlp import IntentBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -47,10 +46,25 @@ class DialogIntentPredictionPreprocessor(Preprocessor): Args: data (str): a sentence Example: - 'you are so handsome.' + 'What do I need to do for the card activation?' 
Returns: Dict[str, Any]: the preprocessed data + Example: + { + 'src_token': array([[13, 2054, 2079, 1045...]]), + 'src_pos': array([[ 0, 1, 2, 3...]]), + 'src_type': array([[1, 1, 1, 1...]]), + 'src_turn': array([[1, 1, 1, 1...]]), + 'src_mask': array([[1, 1, 1, 1...]]), + 'mlm_token': array([[13, 2054, 2079, 1045...]]), + 'mlm_label': array([[0, 0, 0, 0...]]), + 'mlm_mask': array([[0, 0, 0, 0...]]), + 'tgt_token': array([[29, 30, 31, 32...]]), + 'tgt_mask': array([[1, 1, 1, 1...]]), + 'ids': array([0]), + 'intent_label': array([-1]) + } """ samples = self.text_field.preprocessor([data]) samples, _ = self.text_field.collate_fn_multi_turn(samples) diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py similarity index 75% rename from modelscope/preprocessors/space/dialog_modeling_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py index c461ade1..ae3c214a 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_modeling_preprocessor.py @@ -6,8 +6,7 @@ from typing import Any, Dict from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert @@ -42,9 +41,19 @@ class DialogModelingPreprocessor(Preprocessor): """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): A sentence and dialogue history info. Example: - 'you are so handsome.' + { + 'user_input': 'i want to leave after 17:15 .', + 'history': { + 'labels': [[13, 1045, 2052, 2066...]], + 'resp': [14, 1045, 2064, 2393...], + 'bspn': [15, 43, 7688, 10733...], + 'db': [19, 24, 20], + 'aspn': [16, 43, 48, 2681, 7180, 10], + 'output': ['i', 'can', 'help', 'with'...] + } + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py similarity index 92% rename from modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py rename to modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py index 6eb17288..cff39577 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/nlp/space/dialog_state_tracking_preprocessor.py @@ -31,13 +31,17 @@ class DialogStateTrackingPreprocessor(Preprocessor): self.processor = multiwoz22Processor() @type_assert(object, dict) - def __call__(self, data: Dict) -> Dict[str, Any]: + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data Args: - data (str): a sentence + data (Dict[str, Any]): a sentence Example: - 'you are so handsome.' 
+ { + 'utter': {'User-1': "Hi, I'm looking for a train that is going" + "to cambridge and arriving there by 20:45, is there anything like that?"}, + 'history_states': [{}] + } Returns: Dict[str, Any]: the preprocessed data diff --git a/modelscope/preprocessors/space/dst_processors.py b/modelscope/preprocessors/nlp/space/dst_processors.py similarity index 100% rename from modelscope/preprocessors/space/dst_processors.py rename to modelscope/preprocessors/nlp/space/dst_processors.py diff --git a/modelscope/preprocessors/nlp/space/fields/__init__.py b/modelscope/preprocessors/nlp/space/fields/__init__.py new file mode 100644 index 00000000..475a99dc --- /dev/null +++ b/modelscope/preprocessors/nlp/space/fields/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .gen_field import MultiWOZBPETextField + from .intent_field import IntentBPETextField +else: + _import_structure = { + 'gen_field': ['MultiWOZBPETextField'], + 'intent_field': ['IntentBPETextField'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/nlp/space/fields/gen_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/gen_field.py rename to modelscope/preprocessors/nlp/space/fields/gen_field.py index 32346bd5..1d1879fe 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/nlp/space/fields/gen_field.py @@ -9,7 +9,7 @@ from itertools import chain import json import numpy as np -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology, utils diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/nlp/space/fields/intent_field.py similarity index 99% rename from modelscope/preprocessors/space/fields/intent_field.py rename to modelscope/preprocessors/nlp/space/fields/intent_field.py index 6d3b5fff..29ea915e 100644 --- a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/nlp/space/fields/intent_field.py @@ -13,7 +13,7 @@ import json import numpy as np from tqdm import tqdm -from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.preprocessors.nlp.space.tokenizer import Tokenizer from modelscope.utils.constant import ModelFile from modelscope.utils.nlp.space import ontology from modelscope.utils.nlp.space.scores import hierarchical_set_score diff --git a/modelscope/preprocessors/space/lazy_dataset.py b/modelscope/preprocessors/nlp/space/lazy_dataset.py similarity index 93% rename from modelscope/preprocessors/space/lazy_dataset.py rename to modelscope/preprocessors/nlp/space/lazy_dataset.py index 8da21db7..536d9341 100644 --- a/modelscope/preprocessors/space/lazy_dataset.py +++ b/modelscope/preprocessors/nlp/space/lazy_dataset.py @@ -1,11 +1,6 @@ -""" -Dataset class -""" - +# Copyright (c) Alibaba, Inc. and its affiliates. 
import json -from modelscope.preprocessors.space.args import str2bool - class LazyDataset(object): """ diff --git a/modelscope/preprocessors/space/preprocess.py b/modelscope/preprocessors/nlp/space/preprocess.py similarity index 92% rename from modelscope/preprocessors/space/preprocess.py rename to modelscope/preprocessors/nlp/space/preprocess.py index bd8d64d1..8aab4711 100644 --- a/modelscope/preprocessors/space/preprocess.py +++ b/modelscope/preprocessors/nlp/space/preprocess.py @@ -1,12 +1,9 @@ -""" -Preprocess script. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import glob import os -from modelscope.preprocessors.space.args import parse_args -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField FILE_NAME = 'train.json' diff --git a/modelscope/preprocessors/space/sampler.py b/modelscope/preprocessors/nlp/space/sampler.py similarity index 96% rename from modelscope/preprocessors/space/sampler.py rename to modelscope/preprocessors/nlp/space/sampler.py index 49a216d1..e549c343 100644 --- a/modelscope/preprocessors/space/sampler.py +++ b/modelscope/preprocessors/nlp/space/sampler.py @@ -1,6 +1,4 @@ -""" -Sampler class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import numpy as np diff --git a/modelscope/preprocessors/space/tensorlistdataset.py b/modelscope/preprocessors/nlp/space/tensorlistdataset.py similarity index 100% rename from modelscope/preprocessors/space/tensorlistdataset.py rename to modelscope/preprocessors/nlp/space/tensorlistdataset.py diff --git a/modelscope/preprocessors/space/tokenizer.py b/modelscope/preprocessors/nlp/space/tokenizer.py similarity index 99% rename from modelscope/preprocessors/space/tokenizer.py rename to modelscope/preprocessors/nlp/space/tokenizer.py index 87f7e8c3..1bd0ce11 100644 --- a/modelscope/preprocessors/space/tokenizer.py +++ b/modelscope/preprocessors/nlp/space/tokenizer.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ from __future__ import (absolute_import, division, print_function, unicode_literals) import collections diff --git a/modelscope/preprocessors/space_T_cn/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/__init__.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/__init__.py rename to modelscope/preprocessors/nlp/space_T_cn/__init__.py diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_cn/fields/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/preprocessors/space_T_cn/fields/database.py b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py similarity index 98% rename from modelscope/preprocessors/space_T_cn/fields/database.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/database.py index 7ae38ee2..2fef8d7e 100644 --- a/modelscope/preprocessors/space_T_cn/fields/database.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/database.py @@ -4,7 +4,7 @@ import sqlite3 import json import tqdm -from modelscope.preprocessors.space_T_cn.fields.struct import Trie +from .struct import Trie class Database: diff --git a/modelscope/preprocessors/space_T_cn/fields/schema_link.py b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py similarity index 99% rename from modelscope/preprocessors/space_T_cn/fields/schema_link.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py index 4b8f9d31..b62d03e4 100644 --- a/modelscope/preprocessors/space_T_cn/fields/schema_link.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/schema_link.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import re -from modelscope.preprocessors.space_T_cn.fields.struct import TypeInfo +from .struct import TypeInfo class SchemaLinker: diff --git a/modelscope/preprocessors/space_T_cn/fields/struct.py b/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py similarity index 100% rename from modelscope/preprocessors/space_T_cn/fields/struct.py rename to modelscope/preprocessors/nlp/space_T_cn/fields/struct.py diff --git a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py similarity index 96% rename from modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py index 63e6fd57..3aabc6a9 100644 --- a/modelscope/preprocessors/space_T_cn/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_cn/table_question_answering_preprocessor.py @@ -8,8 +8,9 @@ from transformers import BertTokenizer from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.space_T_cn.fields.database import Database -from modelscope.preprocessors.space_T_cn.fields.schema_link import SchemaLinker +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.schema_link import \ + SchemaLinker from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile from modelscope.utils.type_assert import type_assert diff --git a/modelscope/preprocessors/star/__init__.py b/modelscope/preprocessors/nlp/space_T_en/__init__.py similarity index 100% rename from modelscope/preprocessors/star/__init__.py rename to 
modelscope/preprocessors/nlp/space_T_en/__init__.py diff --git a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py similarity index 84% rename from modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py rename to modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py index b5dd73a9..00c7bcd7 100644 --- a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/nlp/space_T_en/conversational_text_to_sql_preprocessor.py @@ -12,9 +12,10 @@ from text2sql_lgesql.utils.example import Example from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.preprocessors.star.fields.preprocess_dataset import \ +from modelscope.preprocessors.nlp.space_T_en.fields import SubPreprocessor +from modelscope.preprocessors.nlp.space_T_en.fields.preprocess_dataset import \ preprocess_dataset -from modelscope.preprocessors.star.fields.process_dataset import ( +from modelscope.preprocessors.nlp.space_T_en.fields.process_dataset import ( process_dataset, process_tables) from modelscope.utils.config import Config from modelscope.utils.constant import Fields, ModelFile @@ -56,6 +57,18 @@ class ConversationalTextToSqlPreprocessor(Preprocessor): model_dir=self.model_dir, db_dir=os.path.join(model_dir, 'db')) + self.device = 'cuda' if \ + ('device' not in kwargs or kwargs['device'] == 'gpu') \ + and torch.cuda.is_available() else 'cpu' + use_device = True if self.device == 'cuda' else False + self.processor = \ + SubPreprocessor(model_dir=model_dir, + db_content=True, + use_gpu=use_device) + self.output_tables = \ + process_tables(self.processor, + self.tables) + @type_assert(object, dict) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data diff --git a/modelscope/preprocessors/star/fields/__init__.py b/modelscope/preprocessors/nlp/space_T_en/fields/__init__.py similarity index 100% rename from modelscope/preprocessors/star/fields/__init__.py rename to modelscope/preprocessors/nlp/space_T_en/fields/__init__.py diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py similarity index 100% rename from modelscope/preprocessors/star/fields/common_utils.py rename to modelscope/preprocessors/nlp/space_T_en/fields/common_utils.py diff --git a/modelscope/preprocessors/star/fields/parse.py b/modelscope/preprocessors/nlp/space_T_en/fields/parse.py similarity index 100% rename from modelscope/preprocessors/star/fields/parse.py rename to modelscope/preprocessors/nlp/space_T_en/fields/parse.py diff --git a/modelscope/preprocessors/star/fields/preprocess_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py similarity index 95% rename from modelscope/preprocessors/star/fields/preprocess_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py index 6c84c0e7..a0fd13d1 100644 --- a/modelscope/preprocessors/star/fields/preprocess_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/preprocess_dataset.py @@ -3,7 +3,7 @@ from text2sql_lgesql.preprocess.parse_raw_json import Schema, get_schemas from text2sql_lgesql.process_sql import get_sql -from modelscope.preprocessors.star.fields.parse import get_label +from .parse import get_label def 
preprocess_dataset(processor, dataset, output_tables, database_id, tables): diff --git a/modelscope/preprocessors/star/fields/process_dataset.py b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py similarity index 94% rename from modelscope/preprocessors/star/fields/process_dataset.py rename to modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py index d8ac094a..88059351 100644 --- a/modelscope/preprocessors/star/fields/process_dataset.py +++ b/modelscope/preprocessors/nlp/space_T_en/fields/process_dataset.py @@ -1,17 +1,12 @@ # Copyright (c) rhythmcao modified from https://github.com/rhythmcao/text2sql-lgesql. -import argparse import os import pickle import sys -import time -import json from text2sql_lgesql.asdl.asdl import ASDLGrammar from text2sql_lgesql.asdl.transition_system import TransitionSystem -from modelscope.preprocessors.star.fields.common_utils import SubPreprocessor - sys.path.append(os.path.dirname(os.path.dirname(__file__))) diff --git a/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py new file mode 100644 index 00000000..5693d36e --- /dev/null +++ b/modelscope/preprocessors/nlp/text2text_generation_preprocessor.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) +class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. + """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') + kwargs['padding'] = kwargs.get('padding', False) + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + text_a, _, _ = self.parse_text_and_label(data) + + inputs = self.tokenizer( + text_a, + return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, + **self.tokenize_kwargs) + + # This is produced by tokenizers but is an invalid generate kwargs + if 'token_type_ids' in inputs: + del inputs['token_type_ids'] + return inputs diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 357a946f..4e5ba3bd 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -7,11 +7,12 @@ from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS from modelscope.utils.constant import Fields +from .nlp_base import NLPBasePreprocessor @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_error_correction) -class TextErrorCorrectionPreprocessor(Preprocessor): +class TextErrorCorrectionPreprocessor(NLPBasePreprocessor): """The preprocessor used in text correction task. 
""" @@ -22,7 +23,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): Args: model_dir (str): model path """ - super().__init__(*args, **kwargs) + super().__init__(model_dir, *args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py new file mode 100644 index 00000000..1e972d64 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_jieba_preprocessor.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from typing import Any, Dict + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py new file mode 100644 index 00000000..238e2972 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_tokenizer) +class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, + model_dir: str, + tokenizer=None, + mode=ModeKeys.INFERENCE, + **kwargs): + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get('padding', 'max_length') + kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', + False) + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + super().__init__(model_dir, mode=mode, **kwargs) + + @staticmethod + def get_roberta_tokenizer_dir(model_dir: str) -> Optional[str]: + import os + for name in os.listdir(model_dir): + full_name = os.path.join(model_dir, name) + if 'roberta' in name and os.path.isdir(full_name): + return full_name + + def build_tokenizer(self, model_dir: str): + roberta_tokenizer_dir = self.get_roberta_tokenizer_dir(model_dir) + if roberta_tokenizer_dir: + from transformers import RobertaTokenizer + return RobertaTokenizer.from_pretrained( + roberta_tokenizer_dir, do_lower_case=False) + return super().build_tokenizer(model_dir) + + def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: + if self._mode == ModeKeys.INFERENCE: + return super().__call__(data) + src_rst = super().__call__(data['src_txt']) + src_input_ids = src_rst['input_ids'] + src_attention_mask = src_rst['attention_mask'] + if 'tgt_txt' in data: + labels = super().__call__(data['tgt_txt'])['input_ids'] + else: + labels = src_input_ids[1:] + src_input_ids = src_input_ids[:-1] + src_attention_mask = src_attention_mask[:-1] + + return { + 'input_ids': src_input_ids, + 'attention_mask': src_attention_mask, + 'labels': labels, + } diff --git a/modelscope/preprocessors/nlp/text_ranking_preprocessor.py b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py new file mode 100644 index 00000000..2ada6892 --- /dev/null +++ b/modelscope/preprocessors/nlp/text_ranking_preprocessor.py @@ -0,0 +1,67 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_ranking) +class TextRankingPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in passage ranking model. 
+ """ + + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + super().__init__(model_dir, mode=mode, *args, **kwargs) + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'source_sentence') + self.second_sequence = kwargs.pop('second_sequence', + 'sentences_to_compare') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[tuple, Dict]) -> Dict[str, Any]: + if isinstance(data, tuple): + sentence1, sentence2 = data + elif isinstance(data, dict): + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + if isinstance(sentence2, str): + sentence2 = [sentence2] + if isinstance(sentence1, str): + sentence1 = [sentence1] + sentence1 = sentence1 * len(sentence2) + + max_seq_length = self.sequence_length + feature = self.tokenizer( + sentence1, + sentence2, + padding='max_length', + truncation=True, + max_length=max_seq_length, + return_tensors='pt') + if 'labels' in data: + labels = data['labels'] + feature['labels'] = labels + if 'qid' in data: + qid = data['qid'] + feature['qid'] = qid + return feature diff --git a/modelscope/preprocessors/nlp/token_classification_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_preprocessor.py new file mode 100644 index 00000000..2de0c806 --- /dev/null +++ b/modelscope/preprocessors/nlp/token_classification_preprocessor.py @@ -0,0 +1,261 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Tuple, Union + +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .nlp_base import NLPBasePreprocessor, NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, + module_name=Preprocessors.word_segment_text_to_label_preprocessor) +class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor): + """The preprocessor used to turn a single sentence to a labeled token-classification dict. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.label = kwargs.pop('label', OutputKeys.LABELS) + + def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]: + data = data.split(' ') + data = list(filter(lambda x: len(x) > 0, data)) + + def produce_train_sample(words): + chars = [] + labels = [] + for word in words: + chars.extend(list(word)) + if len(word) == 1: + labels.append('S-CWS') + else: + labels.extend(['B-CWS'] + ['I-CWS'] * (len(word) - 2) + + ['E-CWS']) + assert len(chars) == len(labels) + return chars, labels + + chars, labels = produce_train_sample(data) + return { + self.first_sequence: chars, + self.label: labels, + } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.ner_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sequence_labeling_tokenizer) +class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in normal NER task. 
+ """ + + def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): + """preprocess the data + + Args: + model_dir (str): model path + """ + kwargs['truncation'] = kwargs.get('truncation', True) + kwargs['padding'] = kwargs.get( + 'padding', False if mode == ModeKeys.INFERENCE else 'max_length') + kwargs['max_length'] = kwargs.pop('sequence_length', 128) + self.sequence_length = kwargs['max_length'] + self.label_all_tokens = kwargs.pop('label_all_tokens', False) + super().__init__(model_dir, mode=mode, **kwargs) + + if 'is_split_into_words' in kwargs: + self.is_split_into_words = kwargs.pop('is_split_into_words') + else: + self.is_split_into_words = self.tokenizer.init_kwargs.get( + 'is_split_into_words', False) + if 'label2id' in kwargs: + kwargs.pop('label2id') + self.tokenize_kwargs = kwargs + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + + # preprocess the data for the model input + text = None + labels_list = None + if isinstance(data, str): + text = data + elif isinstance(data, dict): + text = data.get(self.first_sequence) + labels_list = data.get(self.label) + + input_ids = [] + label_mask = [] + offset_mapping = [] + if self.is_split_into_words: + for offset, token in enumerate(list(data)): + subtoken_ids = self.tokenizer.encode( + token, add_special_tokens=False) + if len(subtoken_ids) == 0: + subtoken_ids = [self.tokenizer.unk_token_id] + input_ids.extend(subtoken_ids) + label_mask.extend([1] + [0] * (len(subtoken_ids) - 1)) + offset_mapping.extend([(offset, offset + 1)]) + else: + if self.tokenizer.is_fast: + encodings = self.tokenizer( + text, + add_special_tokens=False, + return_offsets_mapping=True, + **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + word_ids = encodings.word_ids() + for i in range(len(word_ids)): + if word_ids[i] is None: + label_mask.append(0) + elif word_ids[i] == word_ids[i - 1]: + label_mask.append(0) + offset_mapping[-1] = ( + offset_mapping[-1][0], + encodings['offset_mapping'][i][1]) + else: + label_mask.append(1) + offset_mapping.append(encodings['offset_mapping'][i]) + else: + encodings = self.tokenizer( + text, add_special_tokens=False, **self.tokenize_kwargs) + input_ids = encodings['input_ids'] + label_mask, offset_mapping = self.get_label_mask_and_offset_mapping( + text) + + if len(input_ids) >= self.sequence_length - 2: + input_ids = input_ids[:self.sequence_length - 2] + label_mask = label_mask[:self.sequence_length - 2] + input_ids = [self.tokenizer.cls_token_id + ] + input_ids + [self.tokenizer.sep_token_id] + label_mask = [0] + label_mask + [0] + attention_mask = [1] * len(input_ids) + offset_mapping = offset_mapping[:sum(label_mask)] + + if not self.is_transformer_based_model: + input_ids = input_ids[1:-1] + attention_mask = attention_mask[1:-1] + label_mask = label_mask[1:-1] + + if self._mode == ModeKeys.INFERENCE: + input_ids = torch.tensor(input_ids).unsqueeze(0) + attention_mask = torch.tensor(attention_mask).unsqueeze(0) + label_mask = torch.tensor( + label_mask, dtype=torch.bool).unsqueeze(0) + + # the token classification + output = { + 'text': text, + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'label_mask': label_mask, + 'offset_mapping': offset_mapping + } + + # align the labels with tokenized text + if labels_list is not None: + assert self.label2id is not None + # Map that sends B-Xxx label 
to its I-Xxx counterpart + b_to_i_label = [] + label_enumerate_values = [ + k for k, v in sorted( + self.label2id.items(), key=lambda item: item[1]) + ] + for idx, label in enumerate(label_enumerate_values): + if label.startswith('B-') and label.replace( + 'B-', 'I-') in label_enumerate_values: + b_to_i_label.append( + label_enumerate_values.index( + label.replace('B-', 'I-'))) + else: + b_to_i_label.append(idx) + + label_row = [self.label2id[lb] for lb in labels_list] + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_row[word_idx]) + else: + if self.label_all_tokens: + label_ids.append(b_to_i_label[label_row[word_idx]]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels = label_ids + output['labels'] = labels + return output + + def get_tokenizer_class(self): + tokenizer_class = self.tokenizer.__class__.__name__ + if tokenizer_class.endswith( + 'Fast') and tokenizer_class != 'PreTrainedTokenizerFast': + tokenizer_class = tokenizer_class[:-4] + return tokenizer_class + + def get_label_mask_and_offset_mapping(self, text): + label_mask = [] + offset_mapping = [] + tokens = self.tokenizer.tokenize(text) + offset = 0 + if self.get_tokenizer_class() == 'BertTokenizer': + for token in tokens: + is_start = (token[:2] != '##') + if is_start: + label_mask.append(True) + else: + token = token[2:] + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + elif self.get_tokenizer_class() == 'XLMRobertaTokenizer': + last_is_blank = False + for token in tokens: + is_start = (token[0] == '▁') + if is_start: + token = token[1:] + label_mask.append(True) + if len(token) == 0: + last_is_blank = True + continue + else: + label_mask.append(False) + start = offset + text[offset:].index(token) + end = start + len(token) + if last_is_blank or is_start: + offset_mapping.append((start, end)) + else: + offset_mapping[-1] = (offset_mapping[-1][0], end) + offset = end + last_is_blank = False + else: + raise NotImplementedError + + return label_mask, offset_mapping diff --git a/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py new file mode 100644 index 00000000..eb3c4b37 --- /dev/null +++ b/modelscope/preprocessors/nlp/zero_shot_classification_reprocessor.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from .nlp_base import NLPTokenizerPreprocessorBase + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): + """The tokenizer preprocessor used in zero shot classification. 
+    """
+
+    def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
+        """Initialize the preprocessor.
+
+        Args:
+            model_dir (str): the model directory path
+        """
+        self.sequence_length = kwargs.pop('sequence_length', 512)
+        super().__init__(model_dir, mode=mode, **kwargs)
+
+    def __call__(self, data: Union[str, Dict], hypothesis_template: str,
+                 candidate_labels: list) -> Dict[str, Any]:
+        """Process the raw input data into model inputs.
+
+        Args:
+            data (str or dict): a sentence, e.g. 'you are so handsome.'
+            hypothesis_template (str): the template used to turn each candidate label into a hypothesis
+            candidate_labels (list): the candidate labels to classify the sentence against
+
+        Returns:
+            Dict[str, Any]: the preprocessed data
+        """
+        if isinstance(data, dict):
+            data = data.get(self.first_sequence)
+
+        pairs = [[data, hypothesis_template.format(label)]
+                 for label in candidate_labels]
+
+        features = self.tokenizer(
+            pairs,
+            padding=True,
+            truncation=True,
+            max_length=self.sequence_length,
+            truncation_strategy='only_first',
+            return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None)
+        return features
diff --git a/modelscope/preprocessors/space/fields/__init__.py b/modelscope/preprocessors/space/fields/__init__.py
deleted file mode 100644
index 925eac71..00000000
--- a/modelscope/preprocessors/space/fields/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .gen_field import MultiWOZBPETextField
-from .intent_field import IntentBPETextField
diff --git a/modelscope/preprocessors/space/fields/dst_processors.py b/modelscope/preprocessors/space/fields/dst_processors.py
deleted file mode 100644
index 22e06eec..00000000
--- a/modelscope/preprocessors/space/fields/dst_processors.py
+++ /dev/null
@@ -1,1523 +0,0 @@
-#
-# Copyright 2020 Heinrich Heine University Duesseldorf
-#
-# Part of this code is based on the source code of BERT-DST
-# (arXiv:1907.03040)
-# Part of this code is based on the source code of Transformers
-# (arXiv:1910.03771)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import re
-
-import json
-import numpy as np
-import six
-from tqdm import tqdm
-
-logger = logging.getLogger(__name__)
-USER_NAME = 'User'
-SYSTEM_NAME = 'System'
-DIALOG_ACT = 'Dialog_Act'
-
-utter1 = {
-    'User-1':
-    "I'd really like to take my client out to a nice restaurant that serves indian food."
-}
-history_states1 = [
-    {},
-]
-utter2 = {
-    'User-1':
-    "I'd really like to take my client out to a nice restaurant that serves indian food.",
-    'System-1':
-    'I show many restaurants that serve Indian food in that price range.
What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', -} - -history_states2 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - -utter3 = { - 'User-1': - "I'd really like to take my client out to a nice restaurant that serves indian food.", - 'System-1': - 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?', - 'Dialog_Act-1': { - 'Restaurant-Inform': [['choice', 'many'], ['food', 'Indian'], - ['pricerange', 'that price range']] - }, - 'User-2': - 'I am looking for an expensive indian restaurant in the area of centre.', - 'System-2': - 'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant ' - 'in the center of town. I can book a table for you, if you like.', - 'Dialog_Act-2': { - 'Restaurant-Recommend': [['area', 'center of town'], - ['food', 'Indian'], - ['name', 'Saffron Brasserie'], - ['pricerange', 'expensive']] - }, - 'User-3': - 'Sure thing, please book for 6 people at 19:30 on Saturday.' 
-} - -history_states3 = [{}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}, { - 'attraction': { - 'book': { - 'booked': [] - }, - 'semi': { - 'area': '', - 'name': '', - 'type': '' - } - }, - 'hospital': { - 'book': { - 'booked': [] - }, - 'semi': { - 'department': '' - } - }, - 'hotel': { - 'book': { - 'booked': [{ - 'name': 'alexander bed and breakfast', - 'reference': 'JXVKZ7KV' - }], - 'day': - 'sunday', - 'people': - '6', - 'stay': - '4' - }, - 'semi': { - 'area': '', - 'internet': 'yes', - 'name': 'alexander bed and breakfast', - 'parking': 'yes', - 'pricerange': 'cheap', - 'stars': '', - 'type': 'guesthouse' - } - }, - 'police': { - 'book': { - 'booked': [] - }, - 'semi': {} - }, - 'restaurant': { - 'book': { - 'booked': [{ - 'name': 'ask', - 'reference': 'Y2Y8QYBY' - }], - 'day': 'sunday', - 'people': '6', - 'time': '18:45' - }, - 'semi': { - 'area': 'centre', - 'food': 'italian', - 'name': 'ask', - 'pricerange': 'cheap' - } - }, - 'taxi': { - 'book': { - 'booked': [] - }, - 'semi': { - 'arriveBy': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - }, - 'train': { - 'book': { - 'booked': [], - 'people': '' - }, - 'semi': { - 'arriveBy': '', - 'day': '', - 'departure': '', - 'destination': '', - 'leaveAt': '' - } - } -}, {}] - - -class DSTProcessor(object): - ACTS_DICT = { - 'taxi-depart': 'taxi-departure', - 'taxi-dest': 'taxi-destination', - 'taxi-leaveat': 'taxi-leaveAt', - 'taxi-arriveby': 'taxi-arriveBy', - 'train-depart': 'train-departure', - 'train-dest': 'train-destination', - 'train-leaveat': 'train-leaveAt', - 'train-arriveby': 'train-arriveBy', - 'train-bookpeople': 'train-book_people', - 'restaurant-price': 'restaurant-pricerange', - 'restaurant-bookpeople': 'restaurant-book_people', - 'restaurant-bookday': 'restaurant-book_day', - 'restaurant-booktime': 'restaurant-book_time', - 'hotel-price': 'hotel-pricerange', - 'hotel-bookpeople': 'hotel-book_people', - 'hotel-bookday': 'hotel-book_day', - 'hotel-bookstay': 'hotel-book_stay', - 'booking-bookpeople': 'booking-book_people', - 'booking-bookday': 'booking-book_day', - 'booking-bookstay': 'booking-book_stay', - 'booking-booktime': 'booking-book_time', - } - - LABEL_MAPS = {} # Loaded from file - - def __init__(self): - # Required for mapping slot names in dialogue_acts.json file - # to proper designations. 
- pass - - def _convert_inputs_to_utterances(self, inputs: dict, - history_states: list): - """This method is to generate the utterances with user, sys, dialog_acts and metadata, - while metadata is from the history_states or the output from the inference pipline""" - - utterances = [] - user_inputs = [] - sys_gen_inputs = [] - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == USER_NAME: - user_inputs.insert(int(turn) - 1, inputs[item]) - elif name == SYSTEM_NAME: - sys_gen_inputs.insert(int(turn) - 1, inputs[item]) - else: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - - # user is leading the topic should aways larger than sys and dialog acts - assert len(user_inputs) - 1 == len(sys_gen_inputs) - assert len(user_inputs) - 1 == len(dialog_acts_inputs) - # the history states record both user and sys states - assert len(history_states) == len(user_inputs) + len(sys_gen_inputs) - - # the dialog_act at user turn is useless - for i, item in enumerate(history_states): - utterance = {} - # the dialog_act at user turn is useless - utterance['dialog_act'] = dialog_acts_inputs[ - i // 2] if i % 2 == 1 else {} - utterance['text'] = sys_gen_inputs[ - i // 2] if i % 2 == 1 else user_inputs[i // 2] - utterance['metadata'] = item - utterance['span_info'] = [] - utterances.append(utterance) - - return utterances - - def _load_acts(self, inputs: dict, dialog_id='example.json'): - dialog_acts_inputs = [] - for i, item in enumerate(inputs): - name, turn = item.split('-') - if name == DIALOG_ACT: - dialog_acts_inputs.insert(int(turn) - 1, inputs[item]) - s_dict = {} - - for j, item in enumerate(dialog_acts_inputs): - if isinstance(item, dict): - for a in item: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' or \ - aa[1] == 'select' or aa[1] == 'book': - for i in item[a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = dialog_id, str(int(j) + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - - return s_dict - - -class multiwoz22Processor(DSTProcessor): - - def __init__(self): - super().__init__() - - def normalize_time(self, text): - text = re.sub(r'(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2', - text) # am/pm without space - text = re.sub(r'(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3', - text) # am/pm short to long form - text = re.sub( - r'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)', - r'\1\2 \3:\4\5', text) # Missing separator - text = re.sub(r'(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3', - text) # Wrong separator - text = re.sub(r'(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)', - r'\1\2 \3:00\4', text) # normalize simple full hour time - text = re.sub(r'(^| )(\d{1}:\d{2})', r'\g<1>0\2', - text) # Add missing leading 0 - # Map 12 hour times to 24 hour times - text = \ - re.sub( - r'(\d{2})(:\d{2}) ?p\.?m\.?', - lambda x: str(int(x.groups()[0]) + 12 - if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups()[1], text) - text = re.sub(r'(^| )24:(\d{2})', r'\g<1>00:\2', - text) # Correct times that use 24 as hour - return text - - def normalize_text(self, text): - text = self.normalize_time(text) - text = re.sub("n't", ' not', text) - text = re.sub('(^| )zero(-| )star([s.,? 
]|$)', r'\g<1>0 star\3', text) - text = re.sub('(^| )one(-| )star([s.,? ]|$)', r'\g<1>1 star\3', text) - text = re.sub('(^| )two(-| )star([s.,? ]|$)', r'\g<1>2 star\3', text) - text = re.sub('(^| )three(-| )star([s.,? ]|$)', r'\g<1>3 star\3', text) - text = re.sub('(^| )four(-| )star([s.,? ]|$)', r'\g<1>4 star\3', text) - text = re.sub('(^| )five(-| )star([s.,? ]|$)', r'\g<1>5 star\3', text) - text = re.sub('archaelogy', 'archaeology', text) # Systematic typo - text = re.sub('guesthouse', 'guest house', text) # Normalization - text = re.sub('(^| )b ?& ?b([.,? ]|$)', r'\1bed and breakfast\2', - text) # Normalization - text = re.sub('bed & breakfast', 'bed and breakfast', - text) # Normalization - return text - - # Loads the dialogue_acts.json and returns a list - # of slot-value pairs. - def load_acts(self, input_file): - with open(input_file) as f: - acts = json.load(f) - s_dict = {} - for d in acts: - for t in acts[d]: - if int(t) % 2 == 0: - continue - # Only process, if turn has annotation - if isinstance(acts[d][t]['dialog_act'], dict): - for a in acts[d][t]['dialog_act']: - aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' \ - or aa[1] == 'select' or aa[1] == 'book': - for i in acts[d][t]['dialog_act'][a]: - s = i[0].lower() - v = i[1].lower().strip() - if s == 'none' or v == '?' or v == 'none': - continue - slot = aa[0] + '-' + s - if slot in self.ACTS_DICT: - slot = self.ACTS_DICT[slot] - key = d, str(int(t) // 2 + 1), slot - # In case of multiple mentioned values... - # ... Option 1: Keep first informed value - if key not in s_dict: - s_dict[key] = list([v]) - # ... Option 2: Keep last informed value - # s_dict[key] = list([v]) - return s_dict - - # This should only contain label normalizations. All other mappings should - # be defined in LABEL_MAPS. 
- def normalize_label(self, slot, value_label): - # Normalization of empty slots - if value_label == '' or value_label == 'not mentioned': - return 'none' - - # Normalization of time slots - if 'leaveAt' in slot or 'arriveBy' in slot or slot == 'restaurant-book_time': - return self.normalize_time(value_label) - - # Normalization - if 'type' in slot or 'name' in slot or 'destination' in slot or 'departure' in slot: - value_label = re.sub('guesthouse', 'guest house', value_label) - - # Map to boolean slots - if slot == 'hotel-parking' or slot == 'hotel-internet': - if value_label == 'yes' or value_label == 'free': - return 'true' - if value_label == 'no': - return 'false' - if slot == 'hotel-type': - if value_label == 'hotel': - return 'true' - if value_label == 'guest house': - return 'false' - - return value_label - - def tokenize(self, utt): - utt_lower = convert_to_unicode(utt).lower() - utt_lower = self.normalize_text(utt_lower) - utt_tok = [ - tok for tok in map(str.strip, re.split(r'(\W+)', utt_lower)) - if len(tok) > 0 - ] - return utt_tok - - def delex_utt(self, utt, values, unk_token='[UNK]'): - utt_norm = self.tokenize(utt) - for s, vals in values.items(): - for v in vals: - if v != 'none': - v_norm = self.tokenize(v) - v_len = len(v_norm) - for i in range(len(utt_norm) + 1 - v_len): - if utt_norm[i:i + v_len] == v_norm: - utt_norm[i:i + v_len] = [unk_token] * v_len - return utt_norm - - def get_token_pos(self, tok_list, value_label): - find_pos = [] - found = False - label_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value_label)) - if len(item) > 0 - ] - len_label = len(label_list) - for i in range(len(tok_list) + 1 - len_label): - if tok_list[i:i + len_label] == label_list: - find_pos.append((i, i + len_label)) # start, exclusive_end - found = True - return found, find_pos - - def check_label_existence(self, value_label, usr_utt_tok): - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, value_label) - # If no hit even though there should be one, check for value label variants - if not in_usr and value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - in_usr, usr_pos = self.get_token_pos(usr_utt_tok, - value_label_variant) - if in_usr: - break - return in_usr, usr_pos - - def check_slot_referral(self, value_label, slot, seen_slots): - referred_slot = 'none' - if slot == 'hotel-stars' or slot == 'hotel-internet' or slot == 'hotel-parking': - return referred_slot - for s in seen_slots: - # Avoid matches for slots that share values with different meaning. - # hotel-internet and -parking are handled separately as Boolean slots. 
- if s == 'hotel-stars' or s == 'hotel-internet' or s == 'hotel-parking': - continue - if re.match('(hotel|restaurant)-book_people', - s) and slot == 'hotel-book_stay': - continue - if re.match('(hotel|restaurant)-book_people', - slot) and s == 'hotel-book_stay': - continue - if slot != s and (slot not in seen_slots - or seen_slots[slot] != value_label): - if seen_slots[s] == value_label: - referred_slot = s - break - elif value_label in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[value_label]: - if seen_slots[s] == value_label_variant: - referred_slot = s - break - return referred_slot - - def is_in_list(self, tok, value): - found = False - tok_list = [ - item for item in map(str.strip, re.split(r'(\W+)', tok)) - if len(item) > 0 - ] - value_list = [ - item for item in map(str.strip, re.split(r'(\W+)', value)) - if len(item) > 0 - ] - tok_len = len(tok_list) - value_len = len(value_list) - for i in range(tok_len + 1 - value_len): - if tok_list[i:i + value_len] == value_list: - found = True - break - return found - - # Fuzzy matching to label informed slot values - def check_slot_inform(self, value_label, inform_label): - result = False - informed_value = 'none' - vl = ' '.join(self.tokenize(value_label)) - for il in inform_label: - if vl == il: - result = True - elif self.is_in_list(il, vl): - result = True - elif self.is_in_list(vl, il): - result = True - elif il in self.LABEL_MAPS: - for il_variant in self.LABEL_MAPS[il]: - if vl == il_variant: - result = True - break - elif self.is_in_list(il_variant, vl): - result = True - break - elif self.is_in_list(vl, il_variant): - result = True - break - elif vl in self.LABEL_MAPS: - for value_label_variant in self.LABEL_MAPS[vl]: - if value_label_variant == il: - result = True - break - elif self.is_in_list(il, value_label_variant): - result = True - break - elif self.is_in_list(value_label_variant, il): - result = True - break - if result: - informed_value = il - break - return result, informed_value - - def get_turn_label(self, value_label, inform_label, sys_utt_tok, - usr_utt_tok, slot, seen_slots, slot_last_occurrence): - usr_utt_tok_label = [0 for _ in usr_utt_tok] - informed_value = 'none' - referred_slot = 'none' - if value_label == 'none' or value_label == 'dontcare' or value_label == 'true' or value_label == 'false': - class_type = value_label - else: - in_usr, usr_pos = self.check_label_existence( - value_label, usr_utt_tok) - is_informed, informed_value = self.check_slot_inform( - value_label, inform_label) - if in_usr: - class_type = 'copy_value' - if slot_last_occurrence: - (s, e) = usr_pos[-1] - for i in range(s, e): - usr_utt_tok_label[i] = 1 - else: - for (s, e) in usr_pos: - for i in range(s, e): - usr_utt_tok_label[i] = 1 - elif is_informed: - class_type = 'inform' - else: - referred_slot = self.check_slot_referral( - value_label, slot, seen_slots) - if referred_slot != 'none': - class_type = 'refer' - else: - class_type = 'unpointable' - return informed_value, referred_slot, usr_utt_tok_label, class_type - - def _create_example(self, - utterances, - sys_inform_dict, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='example.json'): - - # Collects all slot changes throughout the dialog - cumulative_labels = {slot: 'none' for slot in slot_list} - - # First system utterance is empty, since multiwoz starts with user input - 
utt_tok_list = [[]] - mod_slots_list = [] - - # Collect all utterances and their metadata - usr_sys_switch = True - turn_itr = 0 - - for utt in utterances: - # Assert that system and user utterances alternate - is_sys_utt = utt['metadata'] != {} - if usr_sys_switch == is_sys_utt: - print( - 'WARN: Wrong order of system and user utterances. Skipping rest of the dialog %s' - % (dialog_id)) - break - usr_sys_switch = is_sys_utt - - if is_sys_utt: - turn_itr += 1 - - # Delexicalize sys utterance - if delexicalize_sys_utts and is_sys_utt: - inform_dict = {slot: 'none' for slot in slot_list} - for slot in slot_list: - if (str(dialog_id), str(turn_itr), - slot) in sys_inform_dict: - inform_dict[slot] = sys_inform_dict[(str(dialog_id), - str(turn_itr), - slot)] - utt_tok_list.append( - self.delex_utt(utt['text'], inform_dict, - unk_token)) # normalize utterances - else: - utt_tok_list.append(self.tokenize( - utt['text'])) # normalize utterances - - modified_slots = {} - - # If sys utt, extract metadata (identify and collect modified slots) - if is_sys_utt: - for d in utt['metadata']: - booked = utt['metadata'][d]['book']['booked'] - booked_slots = {} - # Check the booked section - if booked != []: - for s in booked[0]: - booked_slots[s] = self.normalize_label( - '%s-%s' % (d, s), - booked[0][s]) # normalize labels - # Check the semi and the inform slots - for category in ['book', 'semi']: - for s in utt['metadata'][d][category]: - cs = '%s-book_%s' % ( - d, s) if category == 'book' else '%s-%s' % (d, - s) - value_label = self.normalize_label( - cs, utt['metadata'][d][category] - [s]) # normalize labels - # Prefer the slot value as stored in the booked section - if s in booked_slots: - value_label = booked_slots[s] - # Remember modified slots and entire dialog state - if cs in slot_list and cumulative_labels[ - cs] != value_label: - modified_slots[cs] = value_label - cumulative_labels[cs] = value_label - - mod_slots_list.append(modified_slots.copy()) - - # Form proper (usr, sys) turns - turn_itr = 0 - diag_seen_slots_dict = {} - diag_seen_slots_value_dict = {slot: 'none' for slot in slot_list} - diag_state = {slot: 'none' for slot in slot_list} - sys_utt_tok = [] - usr_utt_tok = [] - hst_utt_tok = [] - hst_utt_tok_label_dict = {slot: [] for slot in slot_list} - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - - for i in range(0, len(utt_tok_list) - 1, 2): - sys_utt_tok_label_dict = {} - usr_utt_tok_label_dict = {} - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - referral_dict = {} - class_type_dict = {} - - # Collect turn data - if append_history: - if swap_utterances: - hst_utt_tok = usr_utt_tok + sys_utt_tok + hst_utt_tok - else: - hst_utt_tok = sys_utt_tok + usr_utt_tok + hst_utt_tok - sys_utt_tok = utt_tok_list[i] - usr_utt_tok = utt_tok_list[i + 1] - turn_slots = mod_slots_list[ - i + 1] if len(mod_slots_list) > 1 else {} - - guid = '%s-%s-%s' % (set_type, str(dialog_id), str(turn_itr)) - - if analyze: - print('%15s %2s %s ||| %s' % - (dialog_id, turn_itr, ' '.join(sys_utt_tok), - ' '.join(usr_utt_tok))) - print('%15s %2s [' % (dialog_id, turn_itr), end='') - - new_hst_utt_tok_label_dict = hst_utt_tok_label_dict.copy() - new_diag_state = diag_state.copy() - for slot in slot_list: - value_label = 'none' - if slot in turn_slots: - value_label = turn_slots[slot] - # We keep the original labels so as to not - # overlook unpointable values, as well as to not - # modify any of the original labels for test sets, - # since this would make 
comparison difficult. - value_dict[slot] = value_label - elif label_value_repetitions and slot in diag_seen_slots_dict: - value_label = diag_seen_slots_value_dict[slot] - - # Get dialog act annotations - inform_label = list(['none']) - inform_slot_dict[slot] = 0 - if (str(dialog_id), str(turn_itr), slot) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), slot)] - ]) - inform_slot_dict[slot] = 1 - elif (str(dialog_id), str(turn_itr), - 'booking-' + slot.split('-')[1]) in sys_inform_dict: - inform_label = list([ - self.normalize_label(slot, i) - for i in sys_inform_dict[(str(dialog_id), - str(turn_itr), 'booking-' - + slot.split('-')[1])] - ]) - inform_slot_dict[slot] = 1 - - (informed_value, referred_slot, usr_utt_tok_label, - class_type) = self.get_turn_label( - value_label, - inform_label, - sys_utt_tok, - usr_utt_tok, - slot, - diag_seen_slots_value_dict, - slot_last_occurrence=True) - - inform_dict[slot] = informed_value - - # Generally don't use span prediction on sys utterance (but inform prediction instead). - sys_utt_tok_label = [0 for _ in sys_utt_tok] - - # Determine what to do with value repetitions. - # If value is unique in seen slots, then tag it, otherwise not, - # since correct slot assignment can not be guaranteed anymore. - if label_value_repetitions and slot in diag_seen_slots_dict: - if class_type == 'copy_value' and list( - diag_seen_slots_value_dict.values()).count( - value_label) > 1: - class_type = 'none' - usr_utt_tok_label = [0 for _ in usr_utt_tok_label] - - sys_utt_tok_label_dict[slot] = sys_utt_tok_label - usr_utt_tok_label_dict[slot] = usr_utt_tok_label - - if append_history: - if use_history_labels: - if swap_utterances: - new_hst_utt_tok_label_dict[ - slot] = usr_utt_tok_label + sys_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[ - slot] = sys_utt_tok_label + usr_utt_tok_label + new_hst_utt_tok_label_dict[ - slot] - else: - new_hst_utt_tok_label_dict[slot] = [ - 0 for _ in sys_utt_tok_label + usr_utt_tok_label - + new_hst_utt_tok_label_dict[slot] - ] - - # For now, we map all occurences of unpointable slot values - # to none. However, since the labels will still suggest - # a presence of unpointable slot values, the task of the - # DST is still to find those values. It is just not - # possible to do that via span prediction on the current input. - if class_type == 'unpointable': - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - if analyze: - if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[ - slot]: - print('(%s): %s, ' % (slot, value_label), end='') - elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] \ - and class_type != 'copy_value' and class_type != 'inform': - # If slot has seen before and its class type did not change, label this slot a not present, - # assuming that the slot has not actually been mentioned in this turn. - # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform, - # this must mean there is evidence in the original labels, therefore consider - # them as mentioned again. - class_type_dict[slot] = 'none' - referral_dict[slot] = 'none' - else: - class_type_dict[slot] = class_type - referral_dict[slot] = referred_slot - # Remember that this slot was mentioned during this dialog already. 
- if class_type != 'none': - diag_seen_slots_dict[slot] = class_type - diag_seen_slots_value_dict[slot] = value_label - new_diag_state[slot] = class_type - # Unpointable is not a valid class, therefore replace with - # some valid class for now... - if class_type == 'unpointable': - new_diag_state[slot] = 'copy_value' - - if analyze: - print(']') - - if swap_utterances: - txt_a = usr_utt_tok - txt_b = sys_utt_tok - txt_a_lbl = usr_utt_tok_label_dict - txt_b_lbl = sys_utt_tok_label_dict - else: - txt_a = sys_utt_tok - txt_b = usr_utt_tok - txt_a_lbl = sys_utt_tok_label_dict - txt_b_lbl = usr_utt_tok_label_dict - - example = DSTExample( - guid=guid, - text_a=txt_a, - text_b=txt_b, - history=hst_utt_tok, - text_a_label=txt_a_lbl, - text_b_label=txt_b_lbl, - history_label=hst_utt_tok_label_dict, - values=diag_seen_slots_value_dict.copy(), - inform_label=inform_dict, - inform_slot_label=inform_slot_dict, - refer_label=referral_dict, - diag_state=diag_state, - class_label=class_type_dict) - # Update some variables. - hst_utt_tok_label_dict = new_hst_utt_tok_label_dict.copy() - diag_state = new_diag_state.copy() - - turn_itr += 1 - return example - - def create_example(self, - inputs, - history_states, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False, - dialog_id='0'): - utterances = self._convert_inputs_to_utterances(inputs, history_states) - sys_inform_dict = self._load_acts(inputs) - self.LABEL_MAPS = label_maps - example = self._create_example(utterances, sys_inform_dict, set_type, - slot_list, label_maps, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - - return example - - def create_examples(self, - input_file, - acts_file, - set_type, - slot_list, - label_maps={}, - append_history=False, - use_history_labels=False, - swap_utterances=False, - label_value_repetitions=False, - delexicalize_sys_utts=False, - unk_token='[UNK]', - analyze=False): - """Read a DST json file into a list of DSTExample.""" - - sys_inform_dict = self.load_acts(acts_file) - - with open(input_file, 'r', encoding='utf-8') as reader: - input_data = json.load(reader) - - self.LABEL_MAPS = label_maps - - examples = [] - for dialog_id in tqdm(input_data): - entry = input_data[dialog_id] - utterances = entry['log'] - - example = self._create_example( - utterances, sys_inform_dict, set_type, slot_list, label_maps, - append_history, use_history_labels, swap_utterances, - label_value_repetitions, delexicalize_sys_utts, unk_token, - analyze) - examples.append(example) - - return examples - - -class DSTExample(object): - """ - A single training/test example for the DST dataset. 
- """ - - def __init__(self, - guid, - text_a, - text_b, - history, - text_a_label=None, - text_b_label=None, - history_label=None, - values=None, - inform_label=None, - inform_slot_label=None, - refer_label=None, - diag_state=None, - class_label=None): - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.history = history - self.text_a_label = text_a_label - self.text_b_label = text_b_label - self.history_label = history_label - self.values = values - self.inform_label = inform_label - self.inform_slot_label = inform_slot_label - self.refer_label = refer_label - self.diag_state = diag_state - self.class_label = class_label - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = '' - s += 'guid: %s' % (self.guid) - s += ', text_a: %s' % (self.text_a) - s += ', text_b: %s' % (self.text_b) - s += ', history: %s' % (self.history) - if self.text_a_label: - s += ', text_a_label: %d' % (self.text_a_label) - if self.text_b_label: - s += ', text_b_label: %d' % (self.text_b_label) - if self.history_label: - s += ', history_label: %d' % (self.history_label) - if self.values: - s += ', values: %d' % (self.values) - if self.inform_label: - s += ', inform_label: %d' % (self.inform_label) - if self.inform_slot_label: - s += ', inform_slot_label: %d' % (self.inform_slot_label) - if self.refer_label: - s += ', refer_label: %d' % (self.refer_label) - if self.diag_state: - s += ', diag_state: %d' % (self.diag_state) - if self.class_label: - s += ', class_label: %d' % (self.class_label) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_ids_unmasked, - input_mask, - segment_ids, - start_pos=None, - end_pos=None, - values=None, - inform=None, - inform_slot=None, - refer_id=None, - diag_state=None, - class_label_id=None, - guid='NONE'): - self.guid = guid - self.input_ids = input_ids - self.input_ids_unmasked = input_ids_unmasked - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_pos = start_pos - self.end_pos = end_pos - self.values = values - self.inform = inform - self.inform_slot = inform_slot - self.refer_id = refer_id - self.diag_state = diag_state - self.class_label_id = class_label_id - - -def convert_examples_to_features(examples, - slot_list, - class_types, - model_type, - tokenizer, - max_seq_length, - slot_value_dropout=0.0): - """Loads a data file into a list of `InputBatch`s.""" - - if model_type == 'bert': - model_specs = { - 'MODEL_TYPE': 'bert', - 'CLS_TOKEN': '[CLS]', - 'UNK_TOKEN': '[UNK]', - 'SEP_TOKEN': '[SEP]', - 'TOKEN_CORRECTION': 4 - } - else: - logger.error('Unknown model type (%s). Aborting.' 
% (model_type)) - exit(1) - - def _tokenize_text_and_label(text, text_label_dict, slot, tokenizer, - model_specs, slot_value_dropout): - joint_text_label = [0 for _ in text_label_dict[slot] - ] # joint all slots' label - for slot_text_label in text_label_dict.values(): - for idx, label in enumerate(slot_text_label): - if label == 1: - joint_text_label[idx] = 1 - - text_label = text_label_dict[slot] - tokens = [] - tokens_unmasked = [] - token_labels = [] - for token, token_label, joint_label in zip(text, text_label, - joint_text_label): - token = convert_to_unicode(token) - sub_tokens = tokenizer.tokenize(token) # Most time intensive step - tokens_unmasked.extend(sub_tokens) - if slot_value_dropout == 0.0 or joint_label == 0: - tokens.extend(sub_tokens) - else: - rn_list = np.random.random_sample((len(sub_tokens), )) - for rn, sub_token in zip(rn_list, sub_tokens): - if rn > slot_value_dropout: - tokens.append(sub_token) - else: - tokens.append(model_specs['UNK_TOKEN']) - token_labels.extend([token_label for _ in sub_tokens]) - assert len(tokens) == len(token_labels) - assert len(tokens_unmasked) == len(token_labels) - return tokens, tokens_unmasked, token_labels - - def _truncate_seq_pair(tokens_a, tokens_b, history, max_length): - """Truncates a sequence pair in place to the maximum length. - Copied from bert/run_classifier.py - """ - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) + len(history) - if total_length <= max_length: - break - if len(history) > 0: - history.pop() - elif len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def _truncate_length_and_warn(tokens_a, tokens_b, history, max_seq_length, - model_specs, guid): - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP], [SEP] with "- 4" (BERT) - if len(tokens_a) + len(tokens_b) + len( - history) > max_seq_length - model_specs['TOKEN_CORRECTION']: - logger.info('Truncate Example %s. Total len=%d.' % - (guid, len(tokens_a) + len(tokens_b) + len(history))) - input_text_too_long = True - else: - input_text_too_long = False - _truncate_seq_pair(tokens_a, tokens_b, history, - max_seq_length - model_specs['TOKEN_CORRECTION']) - return input_text_too_long - - def _get_token_label_ids(token_labels_a, token_labels_b, - token_labels_history, max_seq_length, - model_specs): - token_label_ids = [] - token_label_ids.append(0) # [CLS] - for token_label in token_labels_a: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_b: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - for token_label in token_labels_history: - token_label_ids.append(token_label) - token_label_ids.append(0) # [SEP] - while len(token_label_ids) < max_seq_length: - token_label_ids.append(0) # padding - assert len(token_label_ids) == max_seq_length - return token_label_ids - - def _get_start_end_pos(class_type, token_label_ids, max_seq_length): - if class_type == 'copy_value' and 1 not in token_label_ids: - # logger.warn("copy_value label, but token_label not detected. 
Setting label to 'none'.") - class_type = 'none' - start_pos = 0 - end_pos = 0 - if 1 in token_label_ids: - start_pos = token_label_ids.index(1) - # Parsing is supposed to find only first location of wanted value - if 0 not in token_label_ids[start_pos:]: - end_pos = len(token_label_ids[start_pos:]) + start_pos - 1 - else: - end_pos = token_label_ids[start_pos:].index(0) + start_pos - 1 - for i in range(max_seq_length): - if i >= start_pos and i <= end_pos: - assert token_label_ids[i] == 1 - return class_type, start_pos, end_pos - - def _get_transformer_input(tokens_a, tokens_b, history, max_seq_length, - tokenizer, model_specs): - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append(model_specs['CLS_TOKEN']) - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(0) - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - for token in history: - tokens.append(token) - segment_ids.append(1) - tokens.append(model_specs['SEP_TOKEN']) - segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - return tokens, input_ids, input_mask, segment_ids - - total_cnt = 0 - too_long_cnt = 0 - - refer_list = ['none'] + slot_list - - features = [] - # Convert single example - for (example_index, example) in enumerate(examples): - if example_index % 1000 == 0: - logger.info('Writing example %d of %d' % - (example_index, len(examples))) - - total_cnt += 1 - - value_dict = {} - inform_dict = {} - inform_slot_dict = {} - refer_id_dict = {} - diag_state_dict = {} - class_label_id_dict = {} - start_pos_dict = {} - end_pos_dict = {} - for slot in slot_list: - tokens_a, tokens_a_unmasked, token_labels_a = _tokenize_text_and_label( - example.text_a, example.text_a_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_b, tokens_b_unmasked, token_labels_b = _tokenize_text_and_label( - example.text_b, example.text_b_label, slot, tokenizer, - model_specs, slot_value_dropout) - tokens_history, tokens_history_unmasked, token_labels_history = _tokenize_text_and_label( - example.history, example.history_label, slot, tokenizer, - model_specs, slot_value_dropout) - - input_text_too_long = _truncate_length_and_warn( - tokens_a, tokens_b, tokens_history, max_seq_length, - model_specs, example.guid) - - if input_text_too_long: - if example_index < 10: - if len(token_labels_a) > len(tokens_a): - logger.info(' tokens_a truncated labels: %s' - % str(token_labels_a[len(tokens_a):])) - if len(token_labels_b) > len(tokens_b): - logger.info(' tokens_b truncated labels: %s' - % str(token_labels_b[len(tokens_b):])) - if len(token_labels_history) > len(tokens_history): - logger.info( - ' tokens_history truncated labels: %s' - % str(token_labels_history[len(tokens_history):])) - - token_labels_a = token_labels_a[:len(tokens_a)] - token_labels_b = token_labels_b[:len(tokens_b)] - token_labels_history = token_labels_history[:len(tokens_history - )] - tokens_a_unmasked = tokens_a_unmasked[:len(tokens_a)] - tokens_b_unmasked = tokens_b_unmasked[:len(tokens_b)] - tokens_history_unmasked = tokens_history_unmasked[:len( - tokens_history)] - - assert len(token_labels_a) == len(tokens_a) - assert len(token_labels_b) == len(tokens_b) - assert len(token_labels_history) == len(tokens_history) - assert len(token_labels_a) == len(tokens_a_unmasked) - assert len(token_labels_b) == len(tokens_b_unmasked) - assert len(token_labels_history) == len(tokens_history_unmasked) - token_label_ids = _get_token_label_ids(token_labels_a, - token_labels_b, - token_labels_history, - max_seq_length, model_specs) - - value_dict[slot] = example.values[slot] - inform_dict[slot] = example.inform_label[slot] - - class_label_mod, start_pos_dict[slot], end_pos_dict[ - slot] = _get_start_end_pos(example.class_label[slot], - token_label_ids, max_seq_length) - if class_label_mod != example.class_label[slot]: - example.class_label[slot] = class_label_mod - inform_slot_dict[slot] = example.inform_slot_label[slot] - refer_id_dict[slot] = refer_list.index(example.refer_label[slot]) - diag_state_dict[slot] = class_types.index(example.diag_state[slot]) - class_label_id_dict[slot] = class_types.index( - example.class_label[slot]) - - if input_text_too_long: - too_long_cnt += 1 - - tokens, input_ids, input_mask, segment_ids = _get_transformer_input( - tokens_a, tokens_b, tokens_history, max_seq_length, tokenizer, - 
model_specs) - if slot_value_dropout > 0.0: - _, input_ids_unmasked, _, _ = _get_transformer_input( - tokens_a_unmasked, tokens_b_unmasked, tokens_history_unmasked, - max_seq_length, tokenizer, model_specs) - else: - input_ids_unmasked = input_ids - - assert (len(input_ids) == len(input_ids_unmasked)) - - if example_index < 10: - logger.info('*** Example ***') - logger.info('guid: %s' % (example.guid)) - logger.info('tokens: %s' % ' '.join(tokens)) - logger.info('input_ids: %s' % ' '.join([str(x) - for x in input_ids])) - logger.info('input_mask: %s' - % ' '.join([str(x) for x in input_mask])) - logger.info('segment_ids: %s' - % ' '.join([str(x) for x in segment_ids])) - logger.info('start_pos: %s' % str(start_pos_dict)) - logger.info('end_pos: %s' % str(end_pos_dict)) - logger.info('values: %s' % str(value_dict)) - logger.info('inform: %s' % str(inform_dict)) - logger.info('inform_slot: %s' % str(inform_slot_dict)) - logger.info('refer_id: %s' % str(refer_id_dict)) - logger.info('diag_state: %s' % str(diag_state_dict)) - logger.info('class_label_id: %s' % str(class_label_id_dict)) - - features.append( - InputFeatures( - guid=example.guid, - input_ids=input_ids, - input_ids_unmasked=input_ids_unmasked, - input_mask=input_mask, - segment_ids=segment_ids, - start_pos=start_pos_dict, - end_pos=end_pos_dict, - values=value_dict, - inform=inform_dict, - inform_slot=inform_slot_dict, - refer_id=refer_id_dict, - diag_state=diag_state_dict, - class_label_id=class_label_id_dict)) - - logger.info('========== %d out of %d examples have text too long' % - (too_long_cnt, total_cnt)) - - return features - - -# From bert.tokenization (TF code) -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode('utf-8', 'ignore') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode('utf-8', 'ignore') - elif isinstance(text, unicode): - return text - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - else: - raise ValueError('Not running on Python2 or Python 3?') - - -if __name__ == '__main__': - processor = multiwoz22Processor() - set_type = 'test' - slot_list = [ - 'taxi-leaveAt', 'taxi-destination', 'taxi-departure', 'taxi-arriveBy', - 'restaurant-book_people', 'restaurant-book_day', - 'restaurant-book_time', 'restaurant-food', 'restaurant-pricerange', - 'restaurant-name', 'restaurant-area', 'hotel-book_people', - 'hotel-book_day', 'hotel-book_stay', 'hotel-name', 'hotel-area', - 'hotel-parking', 'hotel-pricerange', 'hotel-stars', 'hotel-internet', - 'hotel-type', 'attraction-type', 'attraction-name', 'attraction-area', - 'train-book_people', 'train-leaveAt', 'train-destination', 'train-day', - 'train-arriveBy', 'train-departure' - ] - append_history = True - use_history_labels = True - swap_utterances = True - label_value_repetitions = True - delexicalize_sys_utts = True, - unk_token = '[UNK]' - analyze = False - example = processor.create_example(utter1, history_states1, set_type, - slot_list, {}, append_history, - use_history_labels, swap_utterances, - label_value_repetitions, - delexicalize_sys_utts, unk_token, - analyze) - print(f'utterances is {example}') diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index dbfe5ba7..d914489c 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ 
-12,7 +12,7 @@ if TYPE_CHECKING:
         MovieSceneSegmentationTrainer, ImageInpaintingTrainer)
     from .multi_modal import CLIPTrainer
     from .nlp import SequenceClassificationTrainer, TextRankingTrainer
-    from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer
+    from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer, NlpTrainerArguments
     from .trainer import EpochBasedTrainer

 else:
@@ -27,7 +27,8 @@ else:
         ],
         'multi_modal': ['CLIPTrainer'],
         'nlp': ['SequenceClassificationTrainer', 'TextRankingTrainer'],
-        'nlp_trainer': ['NlpEpochBasedTrainer', 'VecoTrainer'],
+        'nlp_trainer':
+        ['NlpEpochBasedTrainer', 'VecoTrainer', 'NlpTrainerArguments'],
         'trainer': ['EpochBasedTrainer']
     }
diff --git a/modelscope/trainers/default_config.py b/modelscope/trainers/default_config.py
index c8f0c7b0..a02478b9 100644
--- a/modelscope/trainers/default_config.py
+++ b/modelscope/trainers/default_config.py
@@ -22,7 +22,8 @@ def merge_cfg(cfg: Config):
     This function will pop the default CheckpointHook when the BestCkptSaverHook exists
     in the input cfg.

-    @param cfg: The input cfg to be merged into.
+    Args:
+        cfg: The input cfg to be merged into.
     """
     cfg.merge_from_dict(DEFAULT_CONFIG, force=False)
     # pop duplicate hook
diff --git a/modelscope/trainers/hooks/lr_scheduler_hook.py b/modelscope/trainers/hooks/lr_scheduler_hook.py
index 32fb0250..ed018fef 100644
--- a/modelscope/trainers/hooks/lr_scheduler_hook.py
+++ b/modelscope/trainers/hooks/lr_scheduler_hook.py
@@ -47,7 +47,8 @@ class LrSchedulerHook(Hook):
         return lr

     def before_train_iter(self, trainer):
-        if not self.by_epoch and trainer.iter > 0:
+        if not self.by_epoch and trainer.iter >= getattr(
+                trainer, 'cumulative_iters', 1):
             if self.warmup_lr_scheduler is not None:
                 self.warmup_lr_scheduler.step()
             else:
diff --git a/modelscope/trainers/hooks/optimizer/base.py b/modelscope/trainers/hooks/optimizer/base.py
index 8c61dfdb..0f38c67a 100644
--- a/modelscope/trainers/hooks/optimizer/base.py
+++ b/modelscope/trainers/hooks/optimizer/base.py
@@ -44,6 +44,7 @@ class OptimizerHook(Hook):

     def before_run(self, trainer):
         trainer.optimizer.zero_grad()
+        trainer.cumulative_iters = self.cumulative_iters

     def after_train_iter(self, trainer):
         for k in self.loss_keys:
diff --git a/modelscope/trainers/nlp/__init__.py b/modelscope/trainers/nlp/__init__.py
index 7f1bcd63..22f2cfe6 100644
--- a/modelscope/trainers/nlp/__init__.py
+++ b/modelscope/trainers/nlp/__init__.py
@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .sequence_classification_trainer import SequenceClassificationTrainer
     from .csanmt_translation_trainer import CsanmtTranslationTrainer
-    from .text_ranking_trainer import TextRankingTranier
+    from .text_ranking_trainer import TextRankingTrainer
 else:
     _import_structure = {
         'sequence_classification_trainer': ['SequenceClassificationTrainer'],
diff --git a/modelscope/trainers/nlp/space/dialog_intent_trainer.py b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
index 2e59cd80..4baaddfe 100644
--- a/modelscope/trainers/nlp/space/dialog_intent_trainer.py
+++ b/modelscope/trainers/nlp/space/dialog_intent_trainer.py
@@ -1,23 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
import os -import time -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Dict, Optional import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.data_loader import \ +from modelscope.preprocessors.nlp.space.data_loader import \ get_sequential_data_loader -from modelscope.preprocessors.space.fields.intent_field import \ +from modelscope.preprocessors.nlp.space.fields.intent_field import \ IntentBPETextField -from modelscope.preprocessors.space.preprocess import intent_preprocess +from modelscope.preprocessors.nlp.space.preprocess import intent_preprocess from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.trainer.intent_trainer import IntentTrainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ModelFile from modelscope.utils.logger import get_logger PATH = None @@ -34,14 +33,6 @@ class DialogIntentTrainer(BaseTrainer): **kwargs): super().__init__(os.path.join(kwargs['model_dir'], kwargs['cfg_name'])) - def to_tensor(array): - """ - numpy array -> tensor - """ - import torch - array = torch.tensor(array) - return array.cuda() if self.cfg.use_gpu else array - def setup_seed(seed): import random import torch @@ -59,56 +50,70 @@ class DialogIntentTrainer(BaseTrainer): # preprocess data intent_preprocess(self.cfg.Model.init_checkpoint, self.cfg) # set reader and evaluator - bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) + self.bpe = IntentBPETextField(self.cfg.Model.init_checkpoint, self.cfg) - self.cfg.Model.num_token_embeddings = bpe.vocab_size - self.cfg.Model.num_turn_embeddings = bpe.max_ctx_turn + 1 + self.cfg.Model.num_token_embeddings = self.bpe.vocab_size + self.cfg.Model.num_turn_embeddings = self.bpe.max_ctx_turn + 1 dataset_paths = [ os.path.join(self.cfg.Dataset.data_dir, self.cfg.Dataset.trigger_data) ] # set data and data status - collate_fn = bpe.collate_fn_multi_turn + collate_fn = self.bpe.collate_fn_multi_turn self.train_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='train') self.valid_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='valid') self.test_label_loader = get_sequential_data_loader( batch_size=self.cfg.Trainer.batch_size_label, - reader=bpe, + reader=self.bpe, hparams=self.cfg, data_paths=dataset_paths, collate_fn=collate_fn, data_type='test') # set generator - generator = SpaceGenerator.create(self.cfg, reader=bpe) + self.generator = SpaceGenerator.create(self.cfg, reader=self.bpe) + self._load_model(**kwargs) + + def _load_model(self, **kwargs): + + def to_tensor(array): + """ + numpy array -> tensor + """ + import torch + array = torch.tensor(array) + return array.cuda() if self.cfg.use_gpu else array + # construct model - self.model = SpaceModelBase.create( - self.cfg.Model.init_checkpoint, - self.cfg, - reader=bpe, - generator=generator) + if 'model' in kwargs: + self.model = kwargs['model'] + else: + self.model = SpaceModelBase.create( + kwargs['model_dir'], + self.cfg, + reader=self.bpe, + 
generator=self.generator) import torch - # multi-gpu if self.cfg.Trainer.gpu > 1 and torch.cuda.device_count() > 1: self.model = torch.nn.DataParallel(self.model) # construct trainer self.trainer = IntentTrainer( - self.model, to_tensor, self.cfg, reader=bpe) + self.model, to_tensor, self.cfg, reader=self.bpe) num_batches = len(self.train_label_loader) self.trainer.set_optimizers(num_training_steps_per_epoch=num_batches) # load model, optimizer and lr_scheduler @@ -131,6 +136,16 @@ class DialogIntentTrainer(BaseTrainer): *args, **kwargs) -> Dict[str, float]: logger.info('Evaluate') + self.cfg.do_infer = True + + # get best checkpoint path + pos = checkpoint_path.rfind('/') + checkpoint_name = checkpoint_path[pos + 1:] + checkpoint_dir = checkpoint_path[:pos] + + assert checkpoint_name == ModelFile.TORCH_MODEL_BIN_FILE + kwargs['model_dir'] = checkpoint_dir + self._load_model(**kwargs) self.trainer.infer( data_iter=self.test_label_loader, ex_data_iter=self.train_label_loader) diff --git a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py index 726404d4..aa6bb69d 100644 --- a/modelscope/trainers/nlp/space/dialog_modeling_trainer.py +++ b/modelscope/trainers/nlp/space/dialog_modeling_trainer.py @@ -9,8 +9,7 @@ import numpy as np from modelscope.metainfo import Trainers from modelscope.models.nlp.space.model.generator import SpaceGenerator from modelscope.models.nlp.space.model.model_base import SpaceModelBase -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField +from modelscope.preprocessors.nlp import MultiWOZBPETextField from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp.space.eval import MultiWOZEvaluator diff --git a/modelscope/trainers/nlp/space/trainer/gen_trainer.py b/modelscope/trainers/nlp/space/trainer/gen_trainer.py index 34cd2f9b..05efa138 100644 --- a/modelscope/trainers/nlp/space/trainer/gen_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/gen_trainer.py @@ -1,9 +1,6 @@ -""" -Trainer class. -""" -import logging +# Copyright (c) Alibaba, Inc. and its affiliates. + import os -import sys import time from collections import OrderedDict @@ -61,7 +58,7 @@ class Trainer(object): self.evaluator = evaluator self.tokenizer = reader.tokenizer - self.logger = get_logger() + self.logger = logger or get_logger() self.batch_metrics_tracker = MetricsTracker() self.token_metrics_tracker = MetricsTracker() diff --git a/modelscope/trainers/nlp/space/trainer/intent_trainer.py b/modelscope/trainers/nlp/space/trainer/intent_trainer.py index 1e6f4a2d..dc6b317b 100644 --- a/modelscope/trainers/nlp/space/trainer/intent_trainer.py +++ b/modelscope/trainers/nlp/space/trainer/intent_trainer.py @@ -1,10 +1,6 @@ -""" -Trainer class. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. 
-import logging import os -import sys import time from collections import OrderedDict @@ -16,24 +12,8 @@ from transformers.optimization import AdamW, get_linear_schedule_with_warmup from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ MetricsTracker - - -def get_logger(log_path, name='default'): - logger = logging.getLogger(name) - logger.propagate = False - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter('%(message)s') - - sh = logging.StreamHandler(sys.stdout) - sh.setFormatter(formatter) - logger.addHandler(sh) - - fh = logging.FileHandler(log_path, mode='w') - fh.setFormatter(formatter) - logger.addHandler(fh) - - return logger +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger class Trainer(object): @@ -76,11 +56,7 @@ class Trainer(object): self.lr_scheduler = lr_scheduler self.optimizer = optimizer - # if not os.path.exists(self.save_dir): - # os.makedirs(self.save_dir) - - # self.logger = logger or get_logger(os.path.join(self.save_dir, "trainer.log"), "trainer") - self.logger = logger or get_logger('trainer.log', 'trainer') + self.logger = logger or get_logger() self.batch_metrics_tracker_label = MetricsTracker() self.token_metrics_tracker_label = MetricsTracker() @@ -201,9 +177,12 @@ class Trainer(object): # Save current best model if is_best: - best_model_file = os.path.join(self.save_dir, 'best.model') + best_model_file = os.path.join(self.save_dir, + ModelFile.TORCH_MODEL_BIN_FILE) torch.save(self.model.state_dict(), best_model_file) - best_train_file = os.path.join(self.save_dir, 'best.train') + best_train_file = os.path.join( + self.save_dir, + '{}.train'.format(ModelFile.TORCH_MODEL_BIN_FILE)) torch.save(train_state, best_train_file) self.logger.info( f"Saved best model state to '{best_model_file}' with new best valid metric " @@ -215,7 +194,7 @@ class Trainer(object): def _load_model_state(): model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}.model', + f'{self.func_model.init_checkpoint}', map_location=lambda storage, loc: storage) if 'module.' in list(model_state_dict.keys())[0]: @@ -303,8 +282,13 @@ class Trainer(object): self.logger.info('Loaded no model !!!') return - _load_model_state() - _load_train_state() + if self.do_train: + _load_model_state() + return + + if self.do_infer: + _load_model_state() + _load_train_state() class IntentTrainer(Trainer): @@ -719,104 +703,3 @@ class IntentTrainer(Trainer): assert 'loss' in metrics return metrics['loss'], metrics - - def load(self): - """ load """ - - def _load_model_state(): - model_state_dict = torch.load( - f'{self.func_model.init_checkpoint}', - map_location=lambda storage, loc: storage) - - if 'module.' in list(model_state_dict.keys())[0]: - new_model_state_dict = OrderedDict() - for k, v in model_state_dict.items(): - assert k[:7] == 'module.' 
- new_model_state_dict[k[7:]] = v - model_state_dict = new_model_state_dict - - new_model_state_dict = OrderedDict() - parameters = { - name: param - for name, param in self.func_model.named_parameters() - } - for name, param in model_state_dict.items(): - if name in parameters: - if param.shape != parameters[name].shape: - assert hasattr(param, 'numpy') - arr = param.numpy() - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - if name == 'embedder.token_embedding.weight': - z[-param.shape[0]:] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - else: - if z.shape[0] < param.shape[0]: - z = arr[:z.shape[0]] - print(f'part of parameter({name}) are dropped') - else: - z[:param.shape[0]] = arr - print( - f'part of parameter({name}) random normlize initialize' - ) - dtype, device = param.dtype, param.device - z = torch.tensor(z, dtype=dtype, device=device) - new_model_state_dict[name] = z - else: - new_model_state_dict[name] = param - else: - print(f'parameter({name}) are dropped') - model_state_dict = new_model_state_dict - - for name in parameters: - if name not in model_state_dict: - if parameters[name].requires_grad: - print(f'parameter({name}) random normlize initialize') - z = np.random.normal( - scale=self.func_model.initializer_range, - size=parameters[name].shape).astype('float32') - dtype, device = parameters[name].dtype, parameters[ - name].device - model_state_dict[name] = torch.tensor( - z, dtype=dtype, device=device) - else: - model_state_dict[name] = parameters[name] - - self.func_model.load_state_dict(model_state_dict) - self.logger.info( - f"Loaded model state from '{self.func_model.init_checkpoint}.model'" - ) - - def _load_train_state(): - train_file = f'{self.func_model.init_checkpoint}.train' - if os.path.exists(train_file): - train_state_dict = torch.load( - train_file, map_location=lambda storage, loc: storage) - self.epoch = train_state_dict['epoch'] - self.best_valid_metric = train_state_dict['best_valid_metric'] - if self.optimizer is not None and 'optimizer' in train_state_dict: - self.optimizer.load_state_dict( - train_state_dict['optimizer']) - if self.lr_scheduler is not None and 'lr_scheduler' in train_state_dict: - self.lr_scheduler.load_state_dict( - train_state_dict['lr_scheduler']) - self.logger.info( - f"Loaded train state from '{train_file}' with (epoch-{self.epoch} " - f'best_valid_metric={self.best_valid_metric:.3f})') - else: - self.logger.info('Loaded no train state') - - if self.func_model.init_checkpoint is None: - self.logger.info('Loaded no model !!!') - return - - if self.do_train: - _load_model_state() - return - - if self.do_infer: - _load_model_state() - _load_train_state() diff --git a/modelscope/trainers/nlp/text_ranking_trainer.py b/modelscope/trainers/nlp/text_ranking_trainer.py index 5da9c76a..610c36b5 100644 --- a/modelscope/trainers/nlp/text_ranking_trainer.py +++ b/modelscope/trainers/nlp/text_ranking_trainer.py @@ -12,9 +12,9 @@ from tqdm import tqdm from modelscope.metainfo import Trainers from modelscope.models.base import Model, TorchModel +from modelscope.models.nlp import BertForTextRanking from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.preprocessors.base import Preprocessor -from modelscope.trainers.base import BaseTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer from modelscope.utils.constant import DEFAULT_MODEL_REVISION @@ -118,7 +118,6 @@ 
class TextRankingTrainer(NlpEpochBasedTrainer): Example: {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} """ - from modelscope.models.nlp import TextRanking # get the raw online dataset self.eval_dataloader = self._build_dataloader_with_dataset( self.eval_dataset, @@ -127,7 +126,7 @@ class TextRankingTrainer(NlpEpochBasedTrainer): # generate a standard dataloader # generate a model if checkpoint_path is not None: - model = TextRanking.from_pretrained(checkpoint_path) + model = BertForTextRanking.from_pretrained(checkpoint_path) else: model = self.model @@ -156,13 +155,16 @@ class TextRankingTrainer(NlpEpochBasedTrainer): with torch.no_grad(): label_ids = batch.pop('labels').detach().cpu().numpy() qids = batch.pop('qid').detach().cpu().numpy() - outputs = model(batch) + outputs = model(**batch) infer_end_time = time.time() total_spent_time += infer_end_time - infer_start_time total_samples += self.eval_dataloader.batch_size - assert 'scores' in outputs - logits = outputs['scores'] + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = outputs['logits'].squeeze(-1).detach().cpu().numpy() + logits = sigmoid(logits).tolist() label_list.extend(label_ids) logits_list.extend(logits) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index b54aa666..a19e7c7b 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -1,7 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -from typing import Callable, Optional, Tuple, Union +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -13,15 +15,416 @@ from modelscope.metainfo import Trainers from modelscope.metrics.builder import build_metric from modelscope.models.base import Model, TorchModel from modelscope.msdatasets import MsDataset -from modelscope.preprocessors import Preprocessor, build_preprocessor -from modelscope.utils.config import Config +from modelscope.preprocessors import Preprocessor +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ModeKeys, - ModelFile, Tasks) + ModelFile) from modelscope.utils.hub import parse_label_mapping from .base import TRAINERS from .trainer import EpochBasedTrainer +@dataclass +class NlpTrainerArguments: + """The arguments for the nlp trainer. + + All the arguments listed here have None default values, which means follow the default value in the input + cfg dict. + """ + + work_dir: Optional[str] = field( + default=None, metadata={'help': 'The work dir(key: train.work_dir)'}) + + task: Optional[str] = field( + default=None, metadata={'help': 'The task type(key: task)'}) + + preprocessor_type: Optional[str] = field( + default=None, + metadata={'help': 'The preprocessor type(key: preprocessor.type)'}) + + train_first_sequence: str = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the training dataset(key:preprocessor.train.' + 'first_sequence/dataset.train.first_sequence)' + }) + + train_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the training dataset(key:preprocessor.train.' + 'second_sequence/dataset.train.second_sequence)' + }) + + train_label: str = field( + default=None, + metadata={ + 'help': + 'The key of label for the training dataset(key:preprocessor.train.' 
+ 'second_sequence/dataset.train.second_sequence)' + }) + + eval_first_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of first sentence for the eval dataset(key:preprocessor.val.' + 'first_sequence/dataset.val.first_sequence), ' + 'if not provided, the trainer will use the train_first_sequence for evaluation' + }) + + eval_second_sequence: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of second sentence for the eval dataset(key:preprocessor.val.' + 'second_sequence/dataset.val.second_sequence),' + 'if not provided, the trainer will use the train_second_sequence for evaluation' + }) + + eval_label: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The key of label for the eval dataset(key:preprocessor.val.' + 'second_sequence/dataset.val.second_sequence),' + 'if not provided, the trainer will use the train_label for evaluation' + }) + + labels: Optional[List] = field( + default=None, + metadata={ + 'help': + 'The labels list of the dataset(key:dataset.train.labels),' + 'This parameter has the same effect with "label2id"' + }) + + max_epochs: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The max_epochs of the training loop(key: train.max_epochs)' + }) + + train_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The train batch size per gpu(key: train.dataloader.batch_size_per_gpu)' + }) + + train_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: train.dataloader.workers_per_gpu)' + }) + + train_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the train dataset or not(key: train.dataloader.shuffle)' + }) + + eval_batch_size_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The eval batch size per gpu(key: evaluation.dataloader.batch_size_per_gpu)' + }) + + eval_workers_per_gpu: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The number of workers per gpu(key: evaluation.dataloader.workers_per_gpu)' + }) + + eval_shuffle: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Shuffle the eval dataset or not(key: evaluation.dataloader.shuffle)' + }) + + optimizer_args: Optional[Dict] = field( + default=None, + metadata={'help': 'The optimizer config dict(key: train.optimizer)'}) + + lr_scheduler_args: Optional[Dict] = field( + default=None, + metadata={ + 'help': 'The lr_scheduler config dict(key: train.lr_scheduler)' + }) + + checkpoint_saving_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving type(key: The ckpt hook dict in train.hooks), ' + 'valid options: "BestCkptSaverHook", "CheckpointHook"' + }) + + checkpoint_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Saving checkpoint by epoch or not(key: The by_epoch key in ' + 'ckpt hook dict in train.hooks)' + }) + + checkpoint_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The checkpoint saving interval(key: The interval key in ' + 'ckpt hook dict in train.hooks)' + }) + + metric_key: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The metric key for the BestCkptSaverHook(key: The metric_key key in ' + 'ckpt hook dict in train.hooks), if the checkpoint_saving_type is "CheckpointHook" or ' + '"None", the metric_key key has no effects' + }) + + evaluation_type: Optional[str] = field( + default=None, + metadata={ + 'help': + 'The evaluation type(key: The 
evaluation hook dict in train.hooks), ' + 'valid options: "EvaluationHook", "None"' + }) + + evaluation_by_epoch: Optional[bool] = field( + default=None, + metadata={ + 'help': + 'Evaluating by epoch or not(key: The by_epoch key in ' + 'evaluation hook dict in train.hooks)' + }) + + evaluation_interval: Optional[int] = field( + default=None, + metadata={ + 'help': + 'The evaluating interval(key: The interval key in ' + 'evaluation hook dict in train.hooks)' + }) + + metrics: Optional[List[str]] = field( + default=None, + metadata={'help': 'The metrics class keys(key: evaluation.metrics)'}) + + default_train_config = ConfigDict({ + 'work_dir': + '/tmp', + 'max_epochs': + 5, + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0 + }, + 'optimizer': { + 'type': 'AdamW', + 'lr': 2e-5, + 'options': {} + }, + 'lr_scheduler': { + 'type': 'LinearLR', + 'start_factor': 1.0, + 'end_factor': 0.0, + 'total_iters': 10000, + 'options': { + 'by_epoch': False + } + }, + 'hooks': [{ + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': 100 + }, { + 'type': 'TextLoggerHook', + 'interval': 1 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 100 + }] + }) + + def __call__(self, cfg): + """ + + Args: + cfg(`Config`): The cfg to be modified. + + Returns: + The cfg after modification. + """ + + if self.task is not None: + cfg.task = self.task + + if self.preprocessor_type is not None: + if not hasattr(cfg, 'preprocessor'): + cfg.preprocessor = ConfigDict() + cfg.preprocessor.type = self.preprocessor_type + + if self.train_first_sequence is not None or self.train_second_sequence \ + is not None or self.train_label is not None or self.labels is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'train'): + cfg.dataset.train = ConfigDict() + if self.train_first_sequence is not None: + cfg.dataset.train.first_sequence = self.train_first_sequence + if self.train_second_sequence is not None: + cfg.dataset.train.second_sequence = self.train_second_sequence + if self.train_label is not None: + cfg.dataset.train.label = self.train_label + if self.labels is not None: + cfg.dataset.train.labels = self.labels + + if self.eval_first_sequence is not None or self.eval_second_sequence \ + is not None or self.eval_label is not None: + if not hasattr(cfg, 'dataset'): + cfg.dataset = ConfigDict() + if not hasattr(cfg.dataset, 'val'): + cfg.dataset.val = ConfigDict() + if self.eval_first_sequence is not None: + cfg.dataset.val.first_sequence = self.eval_first_sequence + if self.eval_second_sequence is not None: + cfg.dataset.val.second_sequence = self.eval_second_sequence + if self.eval_label is not None: + cfg.dataset.val.label = self.eval_label + + if self.max_epochs is not None or self.train_batch_size_per_gpu is not None \ + or self.train_shuffle is not None or self.optimizer_args is not None \ + or self.work_dir is not None or self.lr_scheduler_args is not None\ + or self.train_workers_per_gpu is not None: + if not hasattr(cfg, 'train'): + cfg.train = deepcopy(self.default_train_config) + if not hasattr(cfg.train, 'dataloader'): + cfg.train.dataloader = deepcopy( + self.default_train_config.dataloader) + if not hasattr(cfg.train, 'optimizer'): + cfg.train.optimizer = deepcopy( + self.default_train_config.optimizer) + if not hasattr(cfg.train, 'lr_scheduler'): + cfg.train.lr_scheduler = deepcopy( + self.default_train_config.lr_scheduler) + if self.work_dir is not None: + cfg.train.work_dir = 
self.work_dir + if self.max_epochs is not None: + cfg.train.max_epochs = self.max_epochs + if self.train_batch_size_per_gpu is not None: + cfg.train.dataloader.batch_size_per_gpu = self.train_batch_size_per_gpu + if self.train_workers_per_gpu is not None: + cfg.train.dataloader.workers_per_gpu = self.train_workers_per_gpu + if self.train_shuffle is not None: + cfg.train.dataloader.shuffle = self.train_shuffle + if self.optimizer_args is not None: + if cfg.train.optimizer.type != self.optimizer_args.get( + 'type', cfg.train.optimizer.type): + cfg.train.optimizer = ConfigDict( + deepcopy(self.optimizer_args)) + else: + cfg.train.optimizer = Config._merge_a_into_b( + self.optimizer_args, cfg.train.optimizer, force=True) + if self.lr_scheduler_args is not None: + if cfg.train.lr_scheduler.type != self.lr_scheduler_args.get( + 'type', cfg.train.lr_scheduler.type): + cfg.train.lr_scheduler = ConfigDict( + deepcopy(self.lr_scheduler_args)) + else: + cfg.train.lr_scheduler = Config._merge_a_into_b( + self.lr_scheduler_args, + cfg.train.lr_scheduler, + force=True) + + if self.checkpoint_saving_type is not None or self.checkpoint_by_epoch is not None \ + or self.checkpoint_interval is not None or self.metric_key is not None: + if not any([ + self.checkpoint_saving_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter( + lambda hook: hook['type'] not in + ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks)) + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[0])) + cfg.train.hooks[-1].type = self.checkpoint_saving_type + checkpoint_hook = list( + filter( + lambda hook: hook[ + 'type'] in ['CheckpointHook', 'BestCkptSaverHook'], + cfg.train.hooks))[0] + if self.checkpoint_by_epoch is not None: + checkpoint_hook['by_epoch'] = self.checkpoint_by_epoch + if self.checkpoint_interval is not None: + checkpoint_hook['interval'] = self.checkpoint_interval + if checkpoint_hook['type'] == 'BestCkptSaverHook': + assert self.metric_key is not None, 'The metric_key must be provided ' \ + 'if the ckpt saving hook is "BestCkptSaverHook"' + checkpoint_hook['metric_key'] = self.metric_key + + if self.evaluation_type is not None or self.evaluation_by_epoch is not None \ + or self.evaluation_interval is not None or self.eval_batch_size_per_gpu is not None or \ + self.eval_shuffle is not None or self.metrics is not None: + if self.evaluation_type is not None and not any([ + self.evaluation_type == hook['type'] + for hook in cfg.train.hooks + ]): + cfg.train.hooks = list( + filter(lambda hook: hook['type'] not in ['EvaluationHook'], + cfg.train.hooks)) + if self.evaluation_type != 'None': + cfg.train.hooks.append( + deepcopy(self.default_train_config.hooks[3])) + cfg.train.hooks[-1].type = self.evaluation_type + + evaluation_hook = list( + filter(lambda hook: hook['type'] in ['EvaluationHook'], + cfg.train.hooks)) + evaluation_hook = evaluation_hook[0] if len( + evaluation_hook) > 0 else None + + if evaluation_hook is not None and self.evaluation_by_epoch is not None: + evaluation_hook['by_epoch'] = self.evaluation_by_epoch + if evaluation_hook is not None and self.evaluation_interval is not None: + evaluation_hook['interval'] = self.evaluation_interval + + if not hasattr(cfg, 'evaluation'): + cfg.evaluation = ConfigDict({ + 'dataloader': { + 'batch_size_per_gpu': 32, + 'workers_per_gpu': 0, + 'shuffle': False + } + }) + + if self.metrics is not None: + cfg.evaluation.metrics = self.metrics + if self.eval_batch_size_per_gpu is not None: + 
cfg.evaluation.dataloader.batch_size_per_gpu = self.eval_batch_size_per_gpu + if self.eval_workers_per_gpu is not None: + cfg.evaluation.dataloader.workers_per_gpu = self.eval_workers_per_gpu + if self.eval_shuffle is not None: + cfg.evaluation.dataloader.shuffle = self.eval_shuffle + + return cfg + + @TRAINERS.register_module(module_name=Trainers.nlp_base_trainer) class NlpEpochBasedTrainer(EpochBasedTrainer): @@ -80,9 +483,10 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model) else: model_dir = snapshot_download(model, revision=model_revision) - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if cfg_file is None: + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) else: - assert cfg_file is not None, 'Config file should not be None if model is an nn.Module class' + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' model_dir = os.path.dirname(cfg_file) self.label2id = None @@ -91,26 +495,17 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.cfg_modify_fn = cfg_modify_fn self.cfg = self.rebuild_config(Config.from_file(cfg_file)) - label2id = parse_label_mapping(model_dir) - if label2id is not None: - self.label2id = label2id - self.id2label = {id: label for label, id in label2id.items()} - self.num_labels = len(label2id) - else: - try: - labels = self.cfg.dataset.train.labels - if labels is not None and len(labels) > 0: - self.label2id = { - label: idx - for idx, label in enumerate(labels) - } - self.id2label = { - idx: label - for idx, label in enumerate(labels) - } - self.num_labels = len(labels) - except AttributeError: - pass + try: + labels = self.cfg.dataset.train.labels + self.label2id = {label: idx for idx, label in enumerate(labels)} + self.id2label = {idx: label for idx, label in enumerate(labels)} + self.num_labels = len(labels) + except AttributeError: + label2id = parse_label_mapping(model_dir) + if label2id is not None: + self.label2id = label2id + self.id2label = {id: label for label, id in label2id.items()} + self.num_labels = len(label2id) def build_dataset_keys(cfg): if cfg is not None: @@ -185,36 +580,20 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): 'label2id': self.label2id } - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - _train_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.train_keys, 'mode': ModeKeys.TRAIN - }) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - _eval_cfg.update({ - 'model_dir': self.model_dir, - **model_args, - **self.eval_keys, 'mode': ModeKeys.EVAL - }) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN, + **model_args, + **self.train_keys, + mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.EVAL, + **model_args, + **self.eval_keys, + mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor diff 
--git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 61d11aa6..0dc6ece4 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -4,7 +4,7 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import json import torch @@ -22,18 +22,18 @@ from modelscope.msdatasets.ms_dataset import MsDataset from modelscope.msdatasets.task_datasets.builder import build_task_dataset from modelscope.msdatasets.task_datasets.torch_base_dataset import \ TorchTaskDataset +from modelscope.outputs import ModelOutputBase from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import build_preprocessor from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, - ConfigKeys, Hubs, ModeKeys, ModelFile, - Tasks, TrainerStages) + ConfigKeys, ModeKeys, ModelFile, + TrainerStages) from modelscope.utils.data_utils import to_device -from modelscope.utils.device import create_device, verify_device +from modelscope.utils.device import create_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg @@ -146,7 +146,8 @@ class EpochBasedTrainer(BaseTrainer): if ConfigKeys.val in preprocessor: assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) self.eval_preprocessor = preprocessor[ConfigKeys.val] - elif hasattr(self.cfg, ConfigFields.preprocessor): + elif hasattr(self.cfg, ConfigFields.preprocessor + ) and self.cfg.preprocessor is not None: self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( ) @@ -344,23 +345,32 @@ class EpochBasedTrainer(BaseTrainer): preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( - type=self.cfg.task, mode=mode, datasets=datasets) - return build_task_dataset(cfg, self.cfg.task) + type=self.cfg.model.type, mode=mode, datasets=datasets) + task_dataset = build_task_dataset(cfg, self.cfg.task) + task_dataset.trainer = self + return task_dataset else: # avoid add no str value datasets, preprocessors in cfg task_data_build_config = ConfigDict( - mode=mode, datasets=datasets, preprocessor=preprocessor) + type=self.cfg.model.type, + mode=mode, + datasets=datasets, + preprocessor=preprocessor) task_data_build_config.update(task_data_config) - return build_task_dataset(task_data_build_config, - self.cfg.task) + task_dataset = build_task_dataset(task_data_build_config, + self.cfg.task) + task_dataset.trainer = self + return task_dataset except Exception: if isinstance(datasets, (List, Tuple)) or preprocessor is not None: - return TorchTaskDataset( + task_dataset = TorchTaskDataset( datasets, mode=mode, preprocessor=preprocessor, **(dict(type=self.cfg.model.type) if hasattr( self.cfg, 'model') else {})) + task_dataset.trainer = self + return task_dataset else: return datasets @@ -372,35 +382,12 @@ class EpochBasedTrainer(BaseTrainer): Returns: The train preprocessor and eval preprocessor instance. 
""" - field_name = Tasks.find_field_by_task(self.cfg.task) - train_preprocessor, eval_preprocessor = None, None - _train_cfg, _eval_cfg = {}, {} - _dafault_args = {'model_dir': self.model_dir} - - if 'type' not in self.cfg.preprocessor and ( - 'train' in self.cfg.preprocessor - or 'val' in self.cfg.preprocessor): - if 'train' in self.cfg.preprocessor: - _train_cfg = self.cfg.preprocessor.train - if 'val' in self.cfg.preprocessor: - _eval_cfg = self.cfg.preprocessor.val - else: - _train_cfg = self.cfg.preprocessor - _eval_cfg = self.cfg.preprocessor - - if len(_train_cfg): - if isinstance(_train_cfg, Sequence): - # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, - # and add mode for Compose or other plans - raise NotImplementedError('Not supported yet!') - _train_cfg.update(_dafault_args) - train_preprocessor = build_preprocessor(_train_cfg, field_name) - if len(_eval_cfg): - if isinstance(_eval_cfg, Sequence): - raise NotImplementedError('Not supported yet!') - _eval_cfg.update(_dafault_args) - eval_preprocessor = build_preprocessor(_eval_cfg, field_name) - + train_preprocessor = Preprocessor.from_pretrained( + self.model_dir, + cfg_dict=self.cfg, + preprocessor_mode=ModeKeys.TRAIN) + eval_preprocessor = Preprocessor.from_pretrained( + self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL) return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[Union[str, Dict]]: @@ -547,6 +534,8 @@ class EpochBasedTrainer(BaseTrainer): else: train_outputs = model.forward(inputs) + if isinstance(train_outputs, ModelOutputBase): + train_outputs = train_outputs.to_dict() if not isinstance(train_outputs, dict): raise TypeError('"model.forward()" must return a dict') @@ -650,8 +639,9 @@ class EpochBasedTrainer(BaseTrainer): """ # TODO: support MsDataset load for cv if hasattr(data_cfg, 'name'): + dataset_name = data_cfg.pop('name') dataset = MsDataset.load( - dataset_name=data_cfg.pop('name'), + dataset_name=dataset_name, **data_cfg, ) cfg = ConfigDict(type=self.cfg.model.type, mode=mode) diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index a9d7f396..2a7520f2 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -207,6 +207,6 @@ def save_pretrained(model, # Dump the config to the configuration.json if ConfigFields.pipeline not in config: config[ConfigFields.pipeline] = {'type': config[ConfigFields.task]} - cfg_str = json.dumps(config, cls=JSONIteratorEncoder) + cfg_str = json.dumps(config, indent=4, cls=JSONIteratorEncoder) config_file = os.path.join(target_folder, ModelFile.CONFIGURATION) storage.write(cfg_str.encode(), config_file) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 50a1c016..6a9d6fd5 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -115,7 +115,6 @@ class NLPTasks(object): dialog_intent_prediction = 'dialog-intent-prediction' dialog_state_tracking = 'dialog-state-tracking' table_question_answering = 'table-question-answering' - sentence_embedding = 'sentence-embedding' fill_mask = 'fill-mask' text_summarization = 'text-summarization' question_answering = 'question-answering' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 2dbe7045..105b3ffa 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -82,7 +82,8 @@ def get_model_type(model_dir): this file does not exist, the method will try to get the 'model_type' field from the config.json. - @param model_dir: The local model dir to use. 
@return: The model type + Args: + model_dir: The local model dir to use. @return: The model type string, returns None if nothing is found. """ try: @@ -112,8 +113,11 @@ def parse_label_mapping(model_dir): 2. Try to read label-id mapping from the configuration.json 3. Try to read label-id mapping from the config.json - @param model_dir: The local model dir to use. - @return: The label2id mapping if found. + Args: + model_dir: The local model dir to use. + + Returns: + The label2id mapping if found. """ import json import os diff --git a/modelscope/utils/nlp/space/args.py b/modelscope/utils/nlp/space/args.py index d9e91e74..c92401c5 100644 --- a/modelscope/utils/nlp/space/args.py +++ b/modelscope/utils/nlp/space/args.py @@ -1,6 +1,4 @@ -""" -Parse argument. -""" +# Copyright (c) Alibaba, Inc. and its affiliates. import argparse diff --git a/modelscope/utils/nlp/space/clean_dataset.py b/modelscope/utils/nlp/space/clean_dataset.py index 4578ccc4..2c971b10 100644 --- a/modelscope/utils/nlp/space/clean_dataset.py +++ b/modelscope/utils/nlp/space/clean_dataset.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import re diff --git a/modelscope/utils/nlp/space/criterions.py b/modelscope/utils/nlp/space/criterions.py index 60f98457..82ef4ba5 100644 --- a/modelscope/utils/nlp/space/criterions.py +++ b/modelscope/utils/nlp/space/criterions.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import torch import torch.nn.functional as F from torch.nn.modules.loss import _Loss diff --git a/modelscope/utils/nlp/space/db_ops.py b/modelscope/utils/nlp/space/db_ops.py index 880b018b..d1d14ef9 100644 --- a/modelscope/utils/nlp/space/db_ops.py +++ b/modelscope/utils/nlp/space/db_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os import random import sqlite3 diff --git a/modelscope/utils/nlp/space/ontology.py b/modelscope/utils/nlp/space/ontology.py index 99b084bb..c55d12e1 100644 --- a/modelscope/utils/nlp/space/ontology.py +++ b/modelscope/utils/nlp/space/ontology.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + all_domains = [ 'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'police', 'hospital' ] diff --git a/modelscope/utils/nlp/space/scores.py b/modelscope/utils/nlp/space/scores.py index fe0a8a17..eb6dd41c 100644 --- a/modelscope/utils/nlp/space/scores.py +++ b/modelscope/utils/nlp/space/scores.py @@ -1,3 +1,6 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + def hierarchical_set_score(frame1, frame2): # deal with empty frame if not (frame1 and frame2): diff --git a/modelscope/utils/nlp/space/utils.py b/modelscope/utils/nlp/space/utils.py index 81d1b1c5..56e67671 100644 --- a/modelscope/utils/nlp/space/utils.py +++ b/modelscope/utils/nlp/space/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import logging from collections import OrderedDict diff --git a/modelscope/utils/nlp/space/utils_dst.py b/modelscope/utils/nlp/space/utils_dst.py index 2a7e67d7..6277172e 100644 --- a/modelscope/utils/nlp/space/utils_dst.py +++ b/modelscope/utils/nlp/space/utils_dst.py @@ -1,3 +1,29 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
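The docstrings above describe parse_label_mapping as the helper that recovers a label-id mapping from a local model directory, which is also the fallback used by the NlpEpochBasedTrainer hunk when cfg.dataset.train.labels is absent. A small usage sketch; the directory path is hypothetical and the helper may return None when no mapping file exists:

from modelscope.utils.hub import parse_label_mapping

label2id = parse_label_mapping('/path/to/local/model_dir')  # hypothetical path
if label2id is not None:
    id2label = {idx: label for label, idx in label2id.items()}
    num_labels = len(label2id)
    print(num_labels, id2label)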
+from typing import List + +from modelscope.outputs import OutputKeys +from modelscope.pipelines.nlp import DialogStateTrackingPipeline + + +def tracking_and_print_dialog_states( + test_case, pipelines: List[DialogStateTrackingPipeline]): + import json + pipelines_len = len(pipelines) + history_states = [{}] + utter = {} + for step, item in enumerate(test_case): + utter.update(item) + result = pipelines[step % pipelines_len]({ + 'utter': + utter, + 'history_states': + history_states + }) + print(json.dumps(result)) + + history_states.extend([result[OutputKeys.OUTPUT], {}]) + + def batch_to_device(batch, device): batch_on_device = [] for element in batch: diff --git a/modelscope/utils/nlp/space_T_en/__init__.py b/modelscope/utils/nlp/space_T_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/space_T_en/utils.py similarity index 52% rename from modelscope/utils/nlp/nlp_utils.py rename to modelscope/utils/nlp/space_T_en/utils.py index bfeaf924..d884c241 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/space_T_en/utils.py @@ -1,8 +1,9 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + from typing import List from modelscope.outputs import OutputKeys -from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline) +from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline def text2sql_tracking_and_print_results( @@ -22,22 +23,3 @@ def text2sql_tracking_and_print_results( print(results) last_sql = results[OutputKeys.OUTPUT][OutputKeys.TEXT] history.append(item) - - -def tracking_and_print_dialog_states( - test_case, pipelines: List[DialogStateTrackingPipeline]): - import json - pipelines_len = len(pipelines) - history_states = [{}] - utter = {} - for step, item in enumerate(test_case): - utter.update(item) - result = pipelines[step % pipelines_len]({ - 'utter': - utter, - 'history_states': - history_states - }) - print(json.dumps(result)) - - history_states.extend([result[OutputKeys.OUTPUT], {}]) diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index d6994bd3..5284aa43 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -74,6 +74,7 @@ class Registry(object): raise KeyError(f'{module_name} is already registered in ' f'{self._name}[{group_key}]') self._modules[group_key][module_name] = module_cls + module_cls.group_key = group_key def register_module(self, group_key: str = default_group, diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index 3c1e5c1c..8045d3e9 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -7,6 +7,7 @@ import pickle import random import shutil import tempfile +from collections import OrderedDict from collections.abc import Mapping from pathlib import Path from types import FunctionType @@ -14,6 +15,7 @@ from typing import Any, Dict, Union import json import numpy as np +import torch import torch.optim from torch import nn @@ -69,9 +71,10 @@ class RegressTool: **kwargs): """Monitor a pytorch module in a single forward. - @param module: A torch module - @param file_name: The file_name to store or load file - @param compare_fn: A custom fn used to compare the results manually. + Args: + module: A torch module + file_name: The file_name to store or load file + compare_fn: A custom fn used to compare the results manually. 
>>> def compare_fn(v1, v2, key, type): >>> return None @@ -80,6 +83,10 @@ class RegressTool: v2 is the value of current version key is the key of submodules type is in one of 'input', 'output' + + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. """ baseline = os.getenv('REGRESSION_BASELINE') if baseline is None or self.baseline is None: @@ -144,20 +151,24 @@ class RegressTool: This is usually useful when you try to change some dangerous code which has the risk of affecting the training loop. - @param trainer: A dict or an object contains the model/optimizer/lr_scheduler - @param file_name: The file_name to store or load file - @param level: The regression level. + Args: + trainer: A dict or an object contains the model/optimizer/lr_scheduler + file_name: The file_name to store or load file + level: The regression level. 'strict' for matching every single tensor. Please make sure the parameters of head are fixed and the drop-out rate is zero. 'config' for matching the initial config, like cfg file, optimizer param_groups, lr_scheduler params and the random seed. 'metric' for compare the best metrics in the evaluation loop. - @param compare_fn: A custom fn used to compare the results manually. - @param ignore_keys: The keys to ignore of the named_parameters. - @param compare_random: If to compare random setttings, default True. - @param reset_dropout: Reset all dropout modules to 0.0. - @param lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called. + compare_fn: A custom fn used to compare the results manually. + ignore_keys: The keys to ignore of the named_parameters. + compare_random: If to compare random setttings, default True. + reset_dropout: Reset all dropout modules to 0.0. + lazy_stop_callback: A callback passed in, when the moniting is over, this callback will be called. + kwargs: + atol: The absolute gap between two np arrays. + rtol: The relative gap between two np arrays. >>> def compare_fn(v1, v2, key, type): >>> return None @@ -353,16 +364,22 @@ def compare_module(module1: nn.Module, module2: nn.Module): def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Numpify `tensors` (even if it's a nested list/tuple of tensors)." - if isinstance(tensors, (list, tuple)): - return type(tensors)( - numpify_tensor_nested(t, reduction, clip_value) for t in tensors) - if isinstance(tensors, Mapping): - return { + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict({ k: numpify_tensor_nested(t, reduction, clip_value) for k, t in tensors.items() - } + }) + if isinstance(tensors, list): + return list( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) + if isinstance(tensors, tuple): + return tuple( + numpify_tensor_nested(t, reduction, clip_value) for t in tensors) if isinstance(tensors, torch.Tensor): t: np.ndarray = tensors.cpu().numpy() if clip_value is not None: @@ -377,12 +394,19 @@ def numpify_tensor_nested(tensors, reduction=None, clip_value=10000): def detach_tensor_nested(tensors): - import torch + try: + from modelscope.outputs import ModelOutputBase + except ImportError: + ModelOutputBase = dict "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
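numpify_tensor_nested and detach_tensor_nested above walk arbitrarily nested containers (dict-like objects including ModelOutputBase, lists, tuples) and convert the torch tensors they contain. A stand-alone sketch of the same recursion, using only torch; it illustrates the idea rather than the regress_test_utils implementation:

import torch

def detach_and_numpify(obj):
    # Recurse through dict-like, list and tuple containers; convert tensors to numpy.
    if isinstance(obj, dict):
        return {k: detach_and_numpify(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(detach_and_numpify(v) for v in obj)
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu().numpy()
    return obj

nested = {'logits': torch.randn(2, 3), 'extras': [torch.ones(1), (torch.zeros(2),)]}
print(detach_and_numpify(nested))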
- if isinstance(tensors, (list, tuple)): - return type(tensors)(detach_tensor_nested(t) for t in tensors) - if isinstance(tensors, Mapping): - return {k: detach_tensor_nested(t) for k, t in tensors.items()} + if isinstance(tensors, (Mapping, ModelOutputBase)): + return OrderedDict( + {k: detach_tensor_nested(t) + for k, t in tensors.items()}) + if isinstance(tensors, list): + return list(detach_tensor_nested(t) for t in tensors) + if isinstance(tensors, tuple): + return tuple(detach_tensor_nested(t) for t in tensors) if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index 406d671f..8f580d19 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -8,8 +8,11 @@ def torch_nested_numpify(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The numpify tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The numpify tensors. """ import torch @@ -30,8 +33,11 @@ def torch_nested_detach(tensors): NOTE: If the type of input tensors is dict-like(Mapping, dict, OrderedDict, etc.), the return type will be dict. - @param tensors: Nested torch tensors. - @return: The detached tensors. + Args: + tensors: Nested torch tensors. + + Returns: + The detached tensors. """ import torch diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 97926539..0e4f8349 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -3,9 +3,10 @@ import os import shutil import tempfile import unittest +from collections import OrderedDict from modelscope.exporters import Exporter, TorchModelExporter -from modelscope.models.base import Model +from modelscope.models import Model from modelscope.utils.test_utils import test_level @@ -27,10 +28,42 @@ class TestExportSbertSequenceClassification(unittest.TestCase): model = Model.from_pretrained(self.model_id) print( Exporter.from_model(model).export_onnx( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) print( TorchModelExporter.from_model(model).export_torch_script( - shape=(2, 256), outputs=self.tmp_dir)) + shape=(2, 256), output_dir=self.tmp_dir)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_export_outer_module(self): + from transformers import BertForSequenceClassification, BertTokenizerFast + model = BertForSequenceClassification.from_pretrained( + 'bert-base-uncased') + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + dummy_inputs = tokenizer( + tokenizer.unk_token, + padding='max_length', + max_length=256, + return_tensors='pt') + dynamic_axis = {0: 'batch', 1: 'sequence'} + inputs = OrderedDict([ + ('input_ids', dynamic_axis), + ('attention_mask', dynamic_axis), + ('token_type_ids', dynamic_axis), + ]) + outputs = OrderedDict({'logits': {0: 'batch'}}) + output_files = TorchModelExporter().export_onnx( + model=model, + dummy_inputs=dummy_inputs, + inputs=inputs, + outputs=outputs, + output_dir='/tmp') + print(output_files) + output_files = TorchModelExporter().export_torch_script( + model=model, + dummy_inputs=dummy_inputs, + output_dir='/tmp', + strict=False) + print(output_files) if __name__ == '__main__': diff --git 
a/tests/hub/test_download_dataset.py b/tests/hub/test_download_dataset.py new file mode 100644 index 00000000..29b5d1ab --- /dev/null +++ b/tests/hub/test_download_dataset.py @@ -0,0 +1,709 @@ +import unittest + +from modelscope.msdatasets import MsDataset +from modelscope.utils.test_utils import test_level + + +class DownloadDatasetTest(unittest.TestCase): + + def setUp(self): + self.subset_count = 10 + + def download_subset(self, dataset, subset_name): + dataset = MsDataset.load(dataset, subset_name=subset_name) + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset {subset_name} len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_glue(self): + subset = [ + 'cola', 'sst2', 'mrpc', 'qqp', 'stsb', 'mnli', 'mnli_mismatched', + 'mnli_matched', 'qnli', 'rte', 'wnli', 'ax' + ] + for subset_name in subset: + self.download_subset('glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_super_glue(self): + subset = [ + 'boolq', 'cb', 'copa', 'multirc', 'record', 'rte', 'wic', 'wsc', + 'wsc.fixed', 'axb', 'axg' + ] + for subset_name in subset: + self.download_subset('super_glue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_nllb(self): + subset = [ + 'ace_Latn-ban_Latn', 'ace_Latn-bjn_Latn', 'ace_Latn-bug_Latn', + 'ace_Latn-ceb_Latn', 'ace_Latn-eng_Latn', 'ace_Latn-fij_Latn', + 'ace_Latn-ilo_Latn', 'ace_Latn-jav_Latn', 'ace_Latn-min_Latn', + 'ace_Latn-mri_Latn', 'ace_Latn-pag_Latn', 'ace_Latn-plt_Latn', + 'ace_Latn-smo_Latn', 'ace_Latn-sun_Latn', 'ace_Latn-war_Latn', + 'afr_Latn-aka_Latn', 'afr_Latn-amh_Ethi', 'afr_Latn-bam_Latn', + 'afr_Latn-bem_Latn', 'afr_Latn-cjk_Latn', 'afr_Latn-dik_Latn', + 'afr_Latn-dyu_Latn', 'afr_Latn-eng_Latn', 'afr_Latn-ewe_Latn', + 'afr_Latn-fon_Latn', 'afr_Latn-fra_Latn', 'afr_Latn-fuv_Latn', + 'afr_Latn-gaz_Latn', 'afr_Latn-hau_Latn', 'afr_Latn-ibo_Latn', + 'afr_Latn-kam_Latn', 'afr_Latn-kik_Latn', 'afr_Latn-kin_Latn', + 'afr_Latn-kmb_Latn', 'afr_Latn-knc_Arab', 'afr_Latn-knc_Latn', + 'afr_Latn-kon_Latn', 'afr_Latn-lin_Latn', 'afr_Latn-lua_Latn', + 'afr_Latn-lug_Latn', 'afr_Latn-luo_Latn', 'afr_Latn-nso_Latn', + 'afr_Latn-nus_Latn', 'afr_Latn-nya_Latn', 'afr_Latn-run_Latn', + 'afr_Latn-sna_Latn', 'afr_Latn-som_Latn', 'afr_Latn-sot_Latn', + 'afr_Latn-ssw_Latn', 'afr_Latn-swh_Latn', 'afr_Latn-tir_Ethi', + 'afr_Latn-tsn_Latn', 'afr_Latn-tso_Latn', 'afr_Latn-tum_Latn', + 'afr_Latn-twi_Latn', 'afr_Latn-umb_Latn', 'afr_Latn-wol_Latn', + 'afr_Latn-xho_Latn', 'afr_Latn-yor_Latn', 'afr_Latn-zul_Latn', + 'aka_Latn-amh_Ethi', 'aka_Latn-bam_Latn', 'aka_Latn-bem_Latn', + 'aka_Latn-cjk_Latn', 'aka_Latn-dik_Latn', 'aka_Latn-dyu_Latn', + 'aka_Latn-eng_Latn', 'aka_Latn-ewe_Latn', 'aka_Latn-fon_Latn', + 'aka_Latn-fra_Latn', 'aka_Latn-fuv_Latn', 'aka_Latn-gaz_Latn', + 'aka_Latn-hau_Latn', 'aka_Latn-ibo_Latn', 'aka_Latn-kam_Latn', + 'aka_Latn-kik_Latn', 'aka_Latn-kin_Latn', 'aka_Latn-kmb_Latn', + 'aka_Latn-knc_Arab', 'aka_Latn-knc_Latn', 'aka_Latn-kon_Latn', + 'aka_Latn-lin_Latn', 'aka_Latn-lua_Latn', 'aka_Latn-lug_Latn', + 'aka_Latn-luo_Latn', 'aka_Latn-nso_Latn', 'aka_Latn-nus_Latn', + 'aka_Latn-nya_Latn', 'aka_Latn-run_Latn', 
'aka_Latn-sna_Latn', + 'aka_Latn-som_Latn', 'aka_Latn-sot_Latn', 'aka_Latn-ssw_Latn', + 'aka_Latn-swh_Latn', 'aka_Latn-tir_Ethi', 'aka_Latn-tsn_Latn', + 'aka_Latn-tso_Latn', 'aka_Latn-tum_Latn', 'aka_Latn-twi_Latn', + 'aka_Latn-umb_Latn', 'aka_Latn-wol_Latn', 'aka_Latn-xho_Latn', + 'aka_Latn-yor_Latn', 'aka_Latn-zul_Latn', 'amh_Ethi-bam_Latn', + 'amh_Ethi-bem_Latn', 'amh_Ethi-cjk_Latn', 'amh_Ethi-dik_Latn', + 'amh_Ethi-dyu_Latn', 'amh_Ethi-eng_Latn', 'amh_Ethi-ewe_Latn', + 'amh_Ethi-fon_Latn', 'amh_Ethi-fra_Latn', 'amh_Ethi-fuv_Latn', + 'amh_Ethi-gaz_Latn', 'amh_Ethi-hau_Latn', 'amh_Ethi-ibo_Latn', + 'amh_Ethi-kam_Latn', 'amh_Ethi-kik_Latn', 'amh_Ethi-kin_Latn', + 'amh_Ethi-kmb_Latn', 'amh_Ethi-knc_Arab', 'amh_Ethi-knc_Latn', + 'amh_Ethi-kon_Latn', 'amh_Ethi-lin_Latn', 'amh_Ethi-lua_Latn', + 'amh_Ethi-lug_Latn', 'amh_Ethi-luo_Latn', 'amh_Ethi-nso_Latn', + 'amh_Ethi-nus_Latn', 'amh_Ethi-nya_Latn', 'amh_Ethi-run_Latn', + 'amh_Ethi-sna_Latn', 'amh_Ethi-som_Latn', 'amh_Ethi-sot_Latn', + 'amh_Ethi-ssw_Latn', 'amh_Ethi-swh_Latn', 'amh_Ethi-tir_Ethi', + 'amh_Ethi-tsn_Latn', 'amh_Ethi-tso_Latn', 'amh_Ethi-tum_Latn', + 'amh_Ethi-twi_Latn', 'amh_Ethi-umb_Latn', 'amh_Ethi-wol_Latn', + 'amh_Ethi-xho_Latn', 'amh_Ethi-yor_Latn', 'amh_Ethi-zul_Latn', + 'arb_Arab-ckb_Arab', 'arb_Arab-crh_Latn', 'arb_Arab-dik_Latn', + 'arb_Arab-diq_Latn', 'arb_Arab-fuv_Latn', 'arb_Arab-kmr_Latn', + 'arb_Arab-knc_Latn', 'arb_Arab-nus_Latn', 'arb_Arab-som_Latn', + 'arb_Arab-tat_Cyrl', 'arb_Arab-tzm_Tfng', 'arb_Arab-urd_Arab', + 'arb_Arab-wol_Latn', 'asm_Beng-awa_Deva', 'asm_Beng-ben_Beng', + 'asm_Beng-bho_Deva', 'asm_Beng-eng_Latn', 'asm_Beng-guj_Gujr', + 'asm_Beng-hin_Deva', 'asm_Beng-hne_Deva', 'asm_Beng-kan_Knda', + 'asm_Beng-kas_Arab', 'asm_Beng-kas_Deva', 'asm_Beng-mag_Deva', + 'asm_Beng-mai_Deva', 'asm_Beng-mal_Mlym', 'asm_Beng-mar_Deva', + 'asm_Beng-npi_Deva', 'asm_Beng-ory_Orya', 'asm_Beng-pan_Guru', + 'asm_Beng-san_Deva', 'asm_Beng-sat_Beng', 'asm_Beng-sin_Sinh', + 'asm_Beng-snd_Arab', 'asm_Beng-tam_Taml', 'asm_Beng-tel_Telu', + 'asm_Beng-urd_Arab', 'awa_Deva-ben_Beng', 'awa_Deva-bho_Deva', + 'awa_Deva-eng_Latn', 'awa_Deva-guj_Gujr', 'awa_Deva-hin_Deva', + 'awa_Deva-hne_Deva', 'awa_Deva-kan_Knda', 'awa_Deva-kas_Arab', + 'awa_Deva-kas_Deva', 'awa_Deva-mag_Deva', 'awa_Deva-mai_Deva', + 'awa_Deva-mal_Mlym', 'awa_Deva-mar_Deva', 'awa_Deva-npi_Deva', + 'awa_Deva-ory_Orya', 'awa_Deva-pan_Guru', 'awa_Deva-san_Deva', + 'awa_Deva-sat_Beng', 'awa_Deva-sin_Sinh', 'awa_Deva-snd_Arab', + 'awa_Deva-tam_Taml', 'awa_Deva-tel_Telu', 'awa_Deva-urd_Arab', + 'ayr_Latn-eng_Latn', 'ayr_Latn-spa_Latn', 'azb_Arab-eng_Latn', + 'azj_Latn-eng_Latn', 'azj_Latn-rus_Cyrl', 'bak_Cyrl-crh_Latn', + 'bak_Cyrl-eng_Latn', 'bak_Cyrl-kir_Cyrl', 'bak_Cyrl-rus_Cyrl', + 'bak_Cyrl-tat_Cyrl', 'bak_Cyrl-tuk_Latn', 'bak_Cyrl-uig_Arab', + 'bak_Cyrl-uzn_Latn', 'bam_Latn-bem_Latn', 'bam_Latn-cjk_Latn', + 'bam_Latn-dik_Latn', 'bam_Latn-dyu_Latn', 'bam_Latn-eng_Latn', + 'bam_Latn-ewe_Latn', 'bam_Latn-fon_Latn', 'bam_Latn-fra_Latn', + 'bam_Latn-fuv_Latn', 'bam_Latn-gaz_Latn', 'bam_Latn-hau_Latn', + 'bam_Latn-ibo_Latn', 'bam_Latn-kam_Latn', 'bam_Latn-kik_Latn', + 'bam_Latn-kin_Latn', 'bam_Latn-kmb_Latn', 'bam_Latn-knc_Arab', + 'bam_Latn-knc_Latn', 'bam_Latn-kon_Latn', 'bam_Latn-lin_Latn', + 'bam_Latn-lua_Latn', 'bam_Latn-lug_Latn', 'bam_Latn-luo_Latn', + 'bam_Latn-nso_Latn', 'bam_Latn-nus_Latn', 'bam_Latn-nya_Latn', + 'bam_Latn-run_Latn', 'bam_Latn-sna_Latn', 'bam_Latn-som_Latn', + 'bam_Latn-sot_Latn', 'bam_Latn-ssw_Latn', 'bam_Latn-swh_Latn', + 'bam_Latn-tir_Ethi', 
'bam_Latn-tsn_Latn', 'bam_Latn-tso_Latn', + 'bam_Latn-tum_Latn', 'bam_Latn-twi_Latn', 'bam_Latn-umb_Latn', + 'bam_Latn-wol_Latn', 'bam_Latn-xho_Latn', 'bam_Latn-yor_Latn', + 'bam_Latn-zul_Latn', 'ban_Latn-bjn_Latn', 'ban_Latn-bug_Latn', + 'ban_Latn-ceb_Latn', 'ban_Latn-eng_Latn', 'ban_Latn-fij_Latn', + 'ban_Latn-ilo_Latn', 'ban_Latn-jav_Latn', 'ban_Latn-min_Latn', + 'ban_Latn-mri_Latn', 'ban_Latn-pag_Latn', 'ban_Latn-plt_Latn', + 'ban_Latn-smo_Latn', 'ban_Latn-sun_Latn', 'ban_Latn-war_Latn', + 'bel_Cyrl-eng_Latn', 'bel_Cyrl-rus_Cyrl', 'bem_Latn-cjk_Latn', + 'bem_Latn-dik_Latn', 'bem_Latn-dyu_Latn', 'bem_Latn-eng_Latn', + 'bem_Latn-ewe_Latn', 'bem_Latn-fon_Latn', 'bem_Latn-fra_Latn', + 'bem_Latn-fuv_Latn', 'bem_Latn-gaz_Latn', 'bem_Latn-hau_Latn', + 'bem_Latn-ibo_Latn', 'bem_Latn-kam_Latn', 'bem_Latn-kik_Latn', + 'bem_Latn-kin_Latn', 'bem_Latn-kmb_Latn', 'bem_Latn-knc_Arab', + 'bem_Latn-knc_Latn', 'bem_Latn-kon_Latn', 'bem_Latn-lin_Latn', + 'bem_Latn-lua_Latn', 'bem_Latn-lug_Latn', 'bem_Latn-luo_Latn', + 'bem_Latn-nso_Latn', 'bem_Latn-nus_Latn', 'bem_Latn-nya_Latn', + 'bem_Latn-run_Latn', 'bem_Latn-sna_Latn', 'bem_Latn-som_Latn', + 'bem_Latn-sot_Latn', 'bem_Latn-ssw_Latn', 'bem_Latn-swh_Latn', + 'bem_Latn-tir_Ethi', 'bem_Latn-tsn_Latn', 'bem_Latn-tso_Latn', + 'bem_Latn-tum_Latn', 'bem_Latn-twi_Latn', 'bem_Latn-umb_Latn', + 'bem_Latn-wol_Latn', 'bem_Latn-xho_Latn', 'bem_Latn-yor_Latn', + 'bem_Latn-zul_Latn', 'ben_Beng-bho_Deva', 'ben_Beng-eng_Latn', + 'ben_Beng-guj_Gujr', 'ben_Beng-hin_Deva', 'ben_Beng-hne_Deva', + 'ben_Beng-kan_Knda', 'ben_Beng-kas_Arab', 'ben_Beng-kas_Deva', + 'ben_Beng-mag_Deva', 'ben_Beng-mai_Deva', 'ben_Beng-mal_Mlym', + 'ben_Beng-mar_Deva', 'ben_Beng-npi_Deva', 'ben_Beng-ory_Orya', + 'ben_Beng-pan_Guru', 'ben_Beng-pbt_Arab', 'ben_Beng-san_Deva', + 'ben_Beng-sat_Beng', 'ben_Beng-sin_Sinh', 'ben_Beng-snd_Arab', + 'ben_Beng-tam_Taml', 'ben_Beng-tel_Telu', 'ben_Beng-urd_Arab', + 'bho_Deva-eng_Latn', 'bho_Deva-guj_Gujr', 'bho_Deva-hin_Deva', + 'bho_Deva-hne_Deva', 'bho_Deva-kan_Knda', 'bho_Deva-kas_Arab', + 'bho_Deva-kas_Deva', 'bho_Deva-mag_Deva', 'bho_Deva-mai_Deva', + 'bho_Deva-mal_Mlym', 'bho_Deva-mar_Deva', 'bho_Deva-npi_Deva', + 'bho_Deva-ory_Orya', 'bho_Deva-pan_Guru', 'bho_Deva-san_Deva', + 'bho_Deva-sat_Beng', 'bho_Deva-sin_Sinh', 'bho_Deva-snd_Arab', + 'bho_Deva-tam_Taml', 'bho_Deva-tel_Telu', 'bho_Deva-urd_Arab', + 'bjn_Latn-bug_Latn', 'bjn_Latn-ceb_Latn', 'bjn_Latn-eng_Latn', + 'bjn_Latn-fij_Latn', 'bjn_Latn-ilo_Latn', 'bjn_Latn-ind_Latn', + 'bjn_Latn-jav_Latn', 'bjn_Latn-min_Latn', 'bjn_Latn-mri_Latn', + 'bjn_Latn-pag_Latn', 'bjn_Latn-plt_Latn', 'bjn_Latn-smo_Latn', + 'bjn_Latn-sun_Latn', 'bjn_Latn-war_Latn', 'bod_Tibt-eng_Latn', + 'bos_Latn-eng_Latn', 'bug_Latn-ceb_Latn', 'bug_Latn-eng_Latn', + 'bug_Latn-fij_Latn', 'bug_Latn-ilo_Latn', 'bug_Latn-jav_Latn', + 'bug_Latn-min_Latn', 'bug_Latn-mri_Latn', 'bug_Latn-pag_Latn', + 'bug_Latn-plt_Latn', 'bug_Latn-smo_Latn', 'bug_Latn-sun_Latn', + 'bug_Latn-war_Latn', 'ceb_Latn-eng_Latn', 'ceb_Latn-fij_Latn', + 'ceb_Latn-ilo_Latn', 'ceb_Latn-jav_Latn', 'ceb_Latn-min_Latn', + 'ceb_Latn-mri_Latn', 'ceb_Latn-pag_Latn', 'ceb_Latn-plt_Latn', + 'ceb_Latn-smo_Latn', 'ceb_Latn-sun_Latn', 'ceb_Latn-war_Latn', + 'cjk_Latn-dik_Latn', 'cjk_Latn-dyu_Latn', 'cjk_Latn-eng_Latn', + 'cjk_Latn-ewe_Latn', 'cjk_Latn-fon_Latn', 'cjk_Latn-fra_Latn', + 'cjk_Latn-fuv_Latn', 'cjk_Latn-gaz_Latn', 'cjk_Latn-hau_Latn', + 'cjk_Latn-ibo_Latn', 'cjk_Latn-kam_Latn', 'cjk_Latn-kik_Latn', + 'cjk_Latn-kin_Latn', 'cjk_Latn-kmb_Latn', 'cjk_Latn-knc_Arab', + 
'cjk_Latn-knc_Latn', 'cjk_Latn-kon_Latn', 'cjk_Latn-lin_Latn', + 'cjk_Latn-lua_Latn', 'cjk_Latn-lug_Latn', 'cjk_Latn-luo_Latn', + 'cjk_Latn-nso_Latn', 'cjk_Latn-nus_Latn', 'cjk_Latn-nya_Latn', + 'cjk_Latn-por_Latn', 'cjk_Latn-run_Latn', 'cjk_Latn-sna_Latn', + 'cjk_Latn-som_Latn', 'cjk_Latn-sot_Latn', 'cjk_Latn-ssw_Latn', + 'cjk_Latn-swh_Latn', 'cjk_Latn-tir_Ethi', 'cjk_Latn-tsn_Latn', + 'cjk_Latn-tso_Latn', 'cjk_Latn-tum_Latn', 'cjk_Latn-twi_Latn', + 'cjk_Latn-umb_Latn', 'cjk_Latn-wol_Latn', 'cjk_Latn-xho_Latn', + 'cjk_Latn-yor_Latn', 'cjk_Latn-zul_Latn', 'ckb_Arab-diq_Latn', + 'ckb_Arab-eng_Latn', 'ckb_Arab-kmr_Latn', 'ckb_Arab-pbt_Arab', + 'ckb_Arab-prs_Arab', 'ckb_Arab-tgk_Cyrl', 'crh_Latn-eng_Latn', + 'crh_Latn-kir_Cyrl', 'crh_Latn-rus_Cyrl', 'crh_Latn-tat_Cyrl', + 'crh_Latn-tuk_Latn', 'crh_Latn-uig_Arab', 'crh_Latn-uzn_Latn', + 'cym_Latn-eng_Latn', 'dik_Latn-dyu_Latn', 'dik_Latn-eng_Latn', + 'dik_Latn-ewe_Latn', 'dik_Latn-fon_Latn', 'dik_Latn-fra_Latn', + 'dik_Latn-fuv_Latn', 'dik_Latn-gaz_Latn', 'dik_Latn-hau_Latn', + 'dik_Latn-ibo_Latn', 'dik_Latn-kam_Latn', 'dik_Latn-kik_Latn', + 'dik_Latn-kin_Latn', 'dik_Latn-kmb_Latn', 'dik_Latn-knc_Arab', + 'dik_Latn-knc_Latn', 'dik_Latn-kon_Latn', 'dik_Latn-lin_Latn', + 'dik_Latn-lua_Latn', 'dik_Latn-lug_Latn', 'dik_Latn-luo_Latn', + 'dik_Latn-nso_Latn', 'dik_Latn-nus_Latn', 'dik_Latn-nya_Latn', + 'dik_Latn-run_Latn', 'dik_Latn-sna_Latn', 'dik_Latn-som_Latn', + 'dik_Latn-sot_Latn', 'dik_Latn-ssw_Latn', 'dik_Latn-swh_Latn', + 'dik_Latn-tir_Ethi', 'dik_Latn-tsn_Latn', 'dik_Latn-tso_Latn', + 'dik_Latn-tum_Latn', 'dik_Latn-twi_Latn', 'dik_Latn-umb_Latn', + 'dik_Latn-wol_Latn', 'dik_Latn-xho_Latn', 'dik_Latn-yor_Latn', + 'dik_Latn-zul_Latn', 'diq_Latn-eng_Latn', 'diq_Latn-kmr_Latn', + 'diq_Latn-pbt_Arab', 'diq_Latn-prs_Arab', 'diq_Latn-tgk_Cyrl', + 'dyu_Latn-eng_Latn', 'dyu_Latn-ewe_Latn', 'dyu_Latn-fon_Latn', + 'dyu_Latn-fra_Latn', 'dyu_Latn-fuv_Latn', 'dyu_Latn-gaz_Latn', + 'dyu_Latn-hau_Latn', 'dyu_Latn-ibo_Latn', 'dyu_Latn-kam_Latn', + 'dyu_Latn-kik_Latn', 'dyu_Latn-kin_Latn', 'dyu_Latn-kmb_Latn', + 'dyu_Latn-knc_Arab', 'dyu_Latn-knc_Latn', 'dyu_Latn-kon_Latn', + 'dyu_Latn-lin_Latn', 'dyu_Latn-lua_Latn', 'dyu_Latn-lug_Latn', + 'dyu_Latn-luo_Latn', 'dyu_Latn-nso_Latn', 'dyu_Latn-nus_Latn', + 'dyu_Latn-nya_Latn', 'dyu_Latn-run_Latn', 'dyu_Latn-sna_Latn', + 'dyu_Latn-som_Latn', 'dyu_Latn-sot_Latn', 'dyu_Latn-ssw_Latn', + 'dyu_Latn-swh_Latn', 'dyu_Latn-tir_Ethi', 'dyu_Latn-tsn_Latn', + 'dyu_Latn-tso_Latn', 'dyu_Latn-tum_Latn', 'dyu_Latn-twi_Latn', + 'dyu_Latn-umb_Latn', 'dyu_Latn-wol_Latn', 'dyu_Latn-xho_Latn', + 'dyu_Latn-yor_Latn', 'dyu_Latn-zul_Latn', 'dzo_Tibt-eng_Latn', + 'eng_Latn-als_Latn', 'eng_Latn-epo_Latn', 'eng_Latn-ewe_Latn', + 'eng_Latn-fao_Latn', 'eng_Latn-fij_Latn', 'eng_Latn-fon_Latn', + 'eng_Latn-fur_Latn', 'eng_Latn-fuv_Latn', 'eng_Latn-gaz_Latn', + 'eng_Latn-gla_Latn', 'eng_Latn-gle_Latn', 'eng_Latn-grn_Latn', + 'eng_Latn-guj_Gujr', 'eng_Latn-hat_Latn', 'eng_Latn-hau_Latn', + 'eng_Latn-hin_Deva', 'eng_Latn-hne_Deva', 'eng_Latn-hye_Armn', + 'eng_Latn-ibo_Latn', 'eng_Latn-ilo_Latn', 'eng_Latn-jav_Latn', + 'eng_Latn-kab_Latn', 'eng_Latn-kac_Latn', 'eng_Latn-kam_Latn', + 'eng_Latn-kan_Knda', 'eng_Latn-kas_Arab', 'eng_Latn-kas_Deva', + 'eng_Latn-kat_Geor', 'eng_Latn-kaz_Cyrl', 'eng_Latn-kbp_Latn', + 'eng_Latn-kea_Latn', 'eng_Latn-khk_Cyrl', 'eng_Latn-khm_Khmr', + 'eng_Latn-kik_Latn', 'eng_Latn-kin_Latn', 'eng_Latn-kir_Cyrl', + 'eng_Latn-kmb_Latn', 'eng_Latn-kmr_Latn', 'eng_Latn-knc_Arab', + 'eng_Latn-knc_Latn', 'eng_Latn-kon_Latn', 
'eng_Latn-lao_Laoo', + 'eng_Latn-lij_Latn', 'eng_Latn-lim_Latn', 'eng_Latn-lin_Latn', + 'eng_Latn-lmo_Latn', 'eng_Latn-ltg_Latn', 'eng_Latn-ltz_Latn', + 'eng_Latn-lua_Latn', 'eng_Latn-lug_Latn', 'eng_Latn-luo_Latn', + 'eng_Latn-lus_Latn', 'eng_Latn-mag_Deva', 'eng_Latn-mai_Deva', + 'eng_Latn-mal_Mlym', 'eng_Latn-mar_Deva', 'eng_Latn-min_Latn', + 'eng_Latn-mlt_Latn', 'eng_Latn-mni_Beng', 'eng_Latn-mos_Latn', + 'eng_Latn-mri_Latn', 'eng_Latn-mya_Mymr', 'eng_Latn-npi_Deva', + 'eng_Latn-nso_Latn', 'eng_Latn-nus_Latn', 'eng_Latn-nya_Latn', + 'eng_Latn-ory_Orya', 'eng_Latn-pag_Latn', 'eng_Latn-pan_Guru', + 'eng_Latn-pap_Latn', 'eng_Latn-pbt_Arab', 'eng_Latn-plt_Latn', + 'eng_Latn-prs_Arab', 'eng_Latn-quy_Latn', 'eng_Latn-run_Latn', + 'eng_Latn-sag_Latn', 'eng_Latn-san_Deva', 'eng_Latn-sat_Beng', + 'eng_Latn-scn_Latn', 'eng_Latn-shn_Mymr', 'eng_Latn-sin_Sinh', + 'eng_Latn-smo_Latn', 'eng_Latn-sna_Latn', 'eng_Latn-snd_Arab', + 'eng_Latn-som_Latn', 'eng_Latn-sot_Latn', 'eng_Latn-srd_Latn', + 'eng_Latn-ssw_Latn', 'eng_Latn-sun_Latn', 'eng_Latn-swh_Latn', + 'eng_Latn-szl_Latn', 'eng_Latn-tam_Taml', 'eng_Latn-taq_Latn', + 'eng_Latn-tat_Cyrl', 'eng_Latn-tel_Telu', 'eng_Latn-tgk_Cyrl', + 'eng_Latn-tgl_Latn', 'eng_Latn-tir_Ethi', 'eng_Latn-tpi_Latn', + 'eng_Latn-tsn_Latn', 'eng_Latn-tso_Latn', 'eng_Latn-tuk_Latn', + 'eng_Latn-tum_Latn', 'eng_Latn-twi_Latn', 'eng_Latn-tzm_Tfng', + 'eng_Latn-uig_Arab', 'eng_Latn-umb_Latn', 'eng_Latn-urd_Arab', + 'eng_Latn-uzn_Latn', 'eng_Latn-vec_Latn', 'eng_Latn-war_Latn', + 'eng_Latn-wol_Latn', 'eng_Latn-xho_Latn', 'eng_Latn-ydd_Hebr', + 'eng_Latn-yor_Latn', 'eng_Latn-zho_Hant', 'eng_Latn-zsm_Latn', + 'eng_Latn-zul_Latn', 'epo_Latn-fra_Latn', 'ewe_Latn-fon_Latn', + 'ewe_Latn-fra_Latn', 'ewe_Latn-fuv_Latn', 'ewe_Latn-gaz_Latn', + 'ewe_Latn-hau_Latn', 'ewe_Latn-ibo_Latn', 'ewe_Latn-kam_Latn', + 'ewe_Latn-kik_Latn', 'ewe_Latn-kin_Latn', 'ewe_Latn-kmb_Latn', + 'ewe_Latn-knc_Arab', 'ewe_Latn-knc_Latn', 'ewe_Latn-kon_Latn', + 'ewe_Latn-lin_Latn', 'ewe_Latn-lua_Latn', 'ewe_Latn-lug_Latn', + 'ewe_Latn-luo_Latn', 'ewe_Latn-nso_Latn', 'ewe_Latn-nus_Latn', + 'ewe_Latn-nya_Latn', 'ewe_Latn-run_Latn', 'ewe_Latn-sna_Latn', + 'ewe_Latn-som_Latn', 'ewe_Latn-sot_Latn', 'ewe_Latn-ssw_Latn', + 'ewe_Latn-swh_Latn', 'ewe_Latn-tir_Ethi', 'ewe_Latn-tsn_Latn', + 'ewe_Latn-tso_Latn', 'ewe_Latn-tum_Latn', 'ewe_Latn-twi_Latn', + 'ewe_Latn-umb_Latn', 'ewe_Latn-wol_Latn', 'ewe_Latn-xho_Latn', + 'ewe_Latn-yor_Latn', 'ewe_Latn-zul_Latn', 'fij_Latn-hin_Deva', + 'fij_Latn-ilo_Latn', 'fij_Latn-jav_Latn', 'fij_Latn-min_Latn', + 'fij_Latn-mri_Latn', 'fij_Latn-pag_Latn', 'fij_Latn-plt_Latn', + 'fij_Latn-smo_Latn', 'fij_Latn-sun_Latn', 'fij_Latn-war_Latn', + 'fon_Latn-fra_Latn', 'fon_Latn-fuv_Latn', 'fon_Latn-gaz_Latn', + 'fon_Latn-hau_Latn', 'fon_Latn-ibo_Latn', 'fon_Latn-kam_Latn', + 'fon_Latn-kik_Latn', 'fon_Latn-kin_Latn', 'fon_Latn-kmb_Latn', + 'fon_Latn-knc_Arab', 'fon_Latn-knc_Latn', 'fon_Latn-kon_Latn', + 'fon_Latn-lin_Latn', 'fon_Latn-lua_Latn', 'fon_Latn-lug_Latn', + 'fon_Latn-luo_Latn', 'fon_Latn-nso_Latn', 'fon_Latn-nus_Latn', + 'fon_Latn-nya_Latn', 'fon_Latn-run_Latn', 'fon_Latn-sna_Latn', + 'fon_Latn-som_Latn', 'fon_Latn-sot_Latn', 'fon_Latn-ssw_Latn', + 'fon_Latn-swh_Latn', 'fon_Latn-tir_Ethi', 'fon_Latn-tsn_Latn', + 'fon_Latn-tso_Latn', 'fon_Latn-tum_Latn', 'fon_Latn-twi_Latn', + 'fon_Latn-umb_Latn', 'fon_Latn-wol_Latn', 'fon_Latn-xho_Latn', + 'fon_Latn-yor_Latn', 'fon_Latn-zul_Latn', 'fra_Latn-fuv_Latn', + 'fra_Latn-gaz_Latn', 'fra_Latn-glg_Latn', 'fra_Latn-hat_Latn', + 'fra_Latn-hau_Latn', 
'fra_Latn-ibo_Latn', 'fra_Latn-kab_Latn', + 'fra_Latn-kam_Latn', 'fra_Latn-kik_Latn', 'fra_Latn-kin_Latn', + 'fra_Latn-kmb_Latn', 'fra_Latn-knc_Arab', 'fra_Latn-knc_Latn', + 'fra_Latn-kon_Latn', 'fra_Latn-lin_Latn', 'fra_Latn-ltz_Latn', + 'fra_Latn-lua_Latn', 'fra_Latn-lug_Latn', 'fra_Latn-luo_Latn', + 'fra_Latn-nso_Latn', 'fra_Latn-nus_Latn', 'fra_Latn-nya_Latn', + 'fra_Latn-oci_Latn', 'fra_Latn-plt_Latn', 'fra_Latn-run_Latn', + 'fra_Latn-sag_Latn', 'fra_Latn-scn_Latn', 'fra_Latn-sna_Latn', + 'fra_Latn-som_Latn', 'fra_Latn-sot_Latn', 'fra_Latn-ssw_Latn', + 'fra_Latn-swh_Latn', 'fra_Latn-tir_Ethi', 'fra_Latn-tsn_Latn', + 'fra_Latn-tso_Latn', 'fra_Latn-tum_Latn', 'fra_Latn-twi_Latn', + 'fra_Latn-tzm_Tfng', 'fra_Latn-umb_Latn', 'fra_Latn-wol_Latn', + 'fra_Latn-xho_Latn', 'fra_Latn-yor_Latn', 'fra_Latn-zul_Latn', + 'fuv_Latn-gaz_Latn', 'fuv_Latn-hau_Latn', 'fuv_Latn-ibo_Latn', + 'fuv_Latn-kam_Latn', 'fuv_Latn-kik_Latn', 'fuv_Latn-kin_Latn', + 'fuv_Latn-kmb_Latn', 'fuv_Latn-knc_Arab', 'fuv_Latn-knc_Latn', + 'fuv_Latn-kon_Latn', 'fuv_Latn-lin_Latn', 'fuv_Latn-lua_Latn', + 'fuv_Latn-lug_Latn', 'fuv_Latn-luo_Latn', 'fuv_Latn-nso_Latn', + 'fuv_Latn-nus_Latn', 'fuv_Latn-nya_Latn', 'fuv_Latn-run_Latn', + 'fuv_Latn-sna_Latn', 'fuv_Latn-som_Latn', 'fuv_Latn-sot_Latn', + 'fuv_Latn-ssw_Latn', 'fuv_Latn-swh_Latn', 'fuv_Latn-tir_Ethi', + 'fuv_Latn-tsn_Latn', 'fuv_Latn-tso_Latn', 'fuv_Latn-tum_Latn', + 'fuv_Latn-twi_Latn', 'fuv_Latn-umb_Latn', 'fuv_Latn-wol_Latn', + 'fuv_Latn-xho_Latn', 'fuv_Latn-yor_Latn', 'fuv_Latn-zul_Latn', + 'gaz_Latn-run_Latn', 'gaz_Latn-sna_Latn', 'gaz_Latn-som_Latn', + 'gaz_Latn-sot_Latn', 'gaz_Latn-ssw_Latn', 'gaz_Latn-swh_Latn', + 'gaz_Latn-tir_Ethi', 'gaz_Latn-tsn_Latn', 'gaz_Latn-tso_Latn', + 'gaz_Latn-tum_Latn', 'gaz_Latn-twi_Latn', 'gaz_Latn-umb_Latn', + 'gaz_Latn-wol_Latn', 'gaz_Latn-xho_Latn', 'gaz_Latn-yor_Latn', + 'gaz_Latn-zul_Latn', 'glg_Latn-por_Latn', 'grn_Latn-por_Latn', + 'guj_Gujr-hin_Deva', 'guj_Gujr-hne_Deva', 'guj_Gujr-kan_Knda', + 'guj_Gujr-kas_Arab', 'guj_Gujr-kas_Deva', 'guj_Gujr-mag_Deva', + 'guj_Gujr-mai_Deva', 'guj_Gujr-mal_Mlym', 'guj_Gujr-mar_Deva', + 'guj_Gujr-npi_Deva', 'guj_Gujr-ory_Orya', 'guj_Gujr-pan_Guru', + 'guj_Gujr-san_Deva', 'guj_Gujr-sat_Beng', 'guj_Gujr-sin_Sinh', + 'guj_Gujr-snd_Arab', 'guj_Gujr-tam_Taml', 'guj_Gujr-tel_Telu', + 'guj_Gujr-urd_Arab', 'hau_Latn-gaz_Latn', 'hau_Latn-ibo_Latn', + 'hau_Latn-kam_Latn', 'hau_Latn-kik_Latn', 'hau_Latn-kin_Latn', + 'hau_Latn-kmb_Latn', 'hau_Latn-knc_Arab', 'hau_Latn-knc_Latn', + 'hau_Latn-kon_Latn', 'hau_Latn-lin_Latn', 'hau_Latn-lua_Latn', + 'hau_Latn-lug_Latn', 'hau_Latn-luo_Latn', 'hau_Latn-nso_Latn', + 'hau_Latn-nus_Latn', 'hau_Latn-nya_Latn', 'hau_Latn-run_Latn', + 'hau_Latn-sna_Latn', 'hau_Latn-som_Latn', 'hau_Latn-sot_Latn', + 'hau_Latn-ssw_Latn', 'hau_Latn-swh_Latn', 'hau_Latn-tir_Ethi', + 'hau_Latn-tsn_Latn', 'hau_Latn-tso_Latn', 'hau_Latn-tum_Latn', + 'hau_Latn-twi_Latn', 'hau_Latn-umb_Latn', 'hau_Latn-wol_Latn', + 'hau_Latn-xho_Latn', 'hau_Latn-yor_Latn', 'hau_Latn-zul_Latn', + 'hin_Deva-hne_Deva', 'hin_Deva-kan_Knda', 'hin_Deva-kas_Arab', + 'hin_Deva-kas_Deva', 'hin_Deva-mag_Deva', 'hin_Deva-mai_Deva', + 'hin_Deva-mal_Mlym', 'hin_Deva-mar_Deva', 'hin_Deva-npi_Deva', + 'hin_Deva-ory_Orya', 'hin_Deva-pan_Guru', 'hin_Deva-pbt_Arab', + 'hin_Deva-san_Deva', 'hin_Deva-sat_Beng', 'hin_Deva-sin_Sinh', + 'hin_Deva-snd_Arab', 'hin_Deva-tam_Taml', 'hin_Deva-tel_Telu', + 'hin_Deva-urd_Arab', 'hne_Deva-kan_Knda', 'hne_Deva-kas_Arab', + 'hne_Deva-kas_Deva', 'hne_Deva-mag_Deva', 'hne_Deva-mai_Deva', + 
'hne_Deva-mal_Mlym', 'hne_Deva-mar_Deva', 'hne_Deva-npi_Deva', + 'hne_Deva-ory_Orya', 'hne_Deva-pan_Guru', 'hne_Deva-san_Deva', + 'hne_Deva-sat_Beng', 'hne_Deva-sin_Sinh', 'hne_Deva-snd_Arab', + 'hne_Deva-tam_Taml', 'hne_Deva-tel_Telu', 'hne_Deva-urd_Arab', + 'hye_Armn-rus_Cyrl', 'ibo_Latn-gaz_Latn', 'ibo_Latn-kam_Latn', + 'ibo_Latn-kik_Latn', 'ibo_Latn-kin_Latn', 'ibo_Latn-kmb_Latn', + 'ibo_Latn-knc_Arab', 'ibo_Latn-knc_Latn', 'ibo_Latn-kon_Latn', + 'ibo_Latn-lin_Latn', 'ibo_Latn-lua_Latn', 'ibo_Latn-lug_Latn', + 'ibo_Latn-luo_Latn', 'ibo_Latn-nso_Latn', 'ibo_Latn-nus_Latn', + 'ibo_Latn-nya_Latn', 'ibo_Latn-run_Latn', 'ibo_Latn-sna_Latn', + 'ibo_Latn-som_Latn', 'ibo_Latn-sot_Latn', 'ibo_Latn-ssw_Latn', + 'ibo_Latn-swh_Latn', 'ibo_Latn-tir_Ethi', 'ibo_Latn-tsn_Latn', + 'ibo_Latn-tso_Latn', 'ibo_Latn-tum_Latn', 'ibo_Latn-twi_Latn', + 'ibo_Latn-umb_Latn', 'ibo_Latn-wol_Latn', 'ibo_Latn-xho_Latn', + 'ibo_Latn-yor_Latn', 'ibo_Latn-zul_Latn', 'ilo_Latn-jav_Latn', + 'ilo_Latn-min_Latn', 'ilo_Latn-mri_Latn', 'ilo_Latn-pag_Latn', + 'ilo_Latn-plt_Latn', 'ilo_Latn-smo_Latn', 'ilo_Latn-sun_Latn', + 'ilo_Latn-war_Latn', 'ind_Latn-ace_Latn', 'ind_Latn-ban_Latn', + 'ind_Latn-jav_Latn', 'ind_Latn-khm_Khmr', 'ind_Latn-lao_Laoo', + 'ind_Latn-min_Latn', 'ind_Latn-mya_Mymr', 'ind_Latn-shn_Mymr', + 'ind_Latn-sun_Latn', 'jav_Latn-min_Latn', 'jav_Latn-mri_Latn', + 'jav_Latn-pag_Latn', 'jav_Latn-plt_Latn', 'jav_Latn-smo_Latn', + 'jav_Latn-sun_Latn', 'jav_Latn-war_Latn', 'kam_Latn-gaz_Latn', + 'kam_Latn-kik_Latn', 'kam_Latn-kin_Latn', 'kam_Latn-kmb_Latn', + 'kam_Latn-knc_Arab', 'kam_Latn-knc_Latn', 'kam_Latn-kon_Latn', + 'kam_Latn-lin_Latn', 'kam_Latn-lua_Latn', 'kam_Latn-lug_Latn', + 'kam_Latn-luo_Latn', 'kam_Latn-nso_Latn', 'kam_Latn-nus_Latn', + 'kam_Latn-nya_Latn', 'kam_Latn-run_Latn', 'kam_Latn-sna_Latn', + 'kam_Latn-som_Latn', 'kam_Latn-sot_Latn', 'kam_Latn-ssw_Latn', + 'kam_Latn-swh_Latn', 'kam_Latn-tir_Ethi', 'kam_Latn-tsn_Latn', + 'kam_Latn-tso_Latn', 'kam_Latn-tum_Latn', 'kam_Latn-twi_Latn', + 'kam_Latn-umb_Latn', 'kam_Latn-wol_Latn', 'kam_Latn-xho_Latn', + 'kam_Latn-yor_Latn', 'kam_Latn-zul_Latn', 'kan_Knda-kas_Arab', + 'kan_Knda-kas_Deva', 'kan_Knda-mag_Deva', 'kan_Knda-mai_Deva', + 'kan_Knda-mal_Mlym', 'kan_Knda-mar_Deva', 'kan_Knda-npi_Deva', + 'kan_Knda-ory_Orya', 'kan_Knda-pan_Guru', 'kan_Knda-san_Deva', + 'kan_Knda-sat_Beng', 'kan_Knda-sin_Sinh', 'kan_Knda-snd_Arab', + 'kan_Knda-tam_Taml', 'kan_Knda-tel_Telu', 'kan_Knda-urd_Arab', + 'kas_Arab-kas_Deva', 'kas_Arab-mag_Deva', 'kas_Arab-mai_Deva', + 'kas_Arab-mal_Mlym', 'kas_Arab-mar_Deva', 'kas_Arab-npi_Deva', + 'kas_Arab-ory_Orya', 'kas_Arab-pan_Guru', 'kas_Arab-san_Deva', + 'kas_Arab-sat_Beng', 'kas_Arab-sin_Sinh', 'kas_Arab-snd_Arab', + 'kas_Arab-tam_Taml', 'kas_Arab-tel_Telu', 'kas_Arab-urd_Arab', + 'kas_Deva-mag_Deva', 'kas_Deva-mai_Deva', 'kas_Deva-mal_Mlym', + 'kas_Deva-mar_Deva', 'kas_Deva-npi_Deva', 'kas_Deva-ory_Orya', + 'kas_Deva-pan_Guru', 'kas_Deva-san_Deva', 'kas_Deva-sat_Beng', + 'kas_Deva-sin_Sinh', 'kas_Deva-snd_Arab', 'kas_Deva-tam_Taml', + 'kas_Deva-tel_Telu', 'kas_Deva-urd_Arab', 'kat_Geor-rus_Cyrl', + 'kea_Latn-por_Latn', 'kik_Latn-gaz_Latn', 'kik_Latn-kin_Latn', + 'kik_Latn-kmb_Latn', 'kik_Latn-kon_Latn', 'kik_Latn-lin_Latn', + 'kik_Latn-lua_Latn', 'kik_Latn-lug_Latn', 'kik_Latn-luo_Latn', + 'kik_Latn-nso_Latn', 'kik_Latn-nus_Latn', 'kik_Latn-nya_Latn', + 'kik_Latn-run_Latn', 'kik_Latn-sna_Latn', 'kik_Latn-som_Latn', + 'kik_Latn-sot_Latn', 'kik_Latn-ssw_Latn', 'kik_Latn-swh_Latn', + 'kik_Latn-tir_Ethi', 'kik_Latn-tsn_Latn', 
'kik_Latn-tso_Latn', + 'kik_Latn-tum_Latn', 'kik_Latn-twi_Latn', 'kik_Latn-umb_Latn', + 'kik_Latn-wol_Latn', 'kik_Latn-xho_Latn', 'kik_Latn-yor_Latn', + 'kik_Latn-zul_Latn', 'kin_Latn-gaz_Latn', 'kin_Latn-kmb_Latn', + 'kin_Latn-kon_Latn', 'kin_Latn-lin_Latn', 'kin_Latn-lua_Latn', + 'kin_Latn-lug_Latn', 'kin_Latn-luo_Latn', 'kin_Latn-nso_Latn', + 'kin_Latn-nus_Latn', 'kin_Latn-nya_Latn', 'kin_Latn-run_Latn', + 'kin_Latn-sna_Latn', 'kin_Latn-som_Latn', 'kin_Latn-sot_Latn', + 'kin_Latn-ssw_Latn', 'kin_Latn-swh_Latn', 'kin_Latn-tir_Ethi', + 'kin_Latn-tsn_Latn', 'kin_Latn-tso_Latn', 'kin_Latn-tum_Latn', + 'kin_Latn-twi_Latn', 'kin_Latn-umb_Latn', 'kin_Latn-wol_Latn', + 'kin_Latn-xho_Latn', 'kin_Latn-yor_Latn', 'kin_Latn-zul_Latn', + 'kir_Cyrl-rus_Cyrl', 'kir_Cyrl-tat_Cyrl', 'kir_Cyrl-tuk_Latn', + 'kir_Cyrl-uig_Arab', 'kir_Cyrl-uzn_Latn', 'kmb_Latn-gaz_Latn', + 'kmb_Latn-kon_Latn', 'kmb_Latn-lin_Latn', 'kmb_Latn-lua_Latn', + 'kmb_Latn-lug_Latn', 'kmb_Latn-luo_Latn', 'kmb_Latn-nso_Latn', + 'kmb_Latn-nus_Latn', 'kmb_Latn-nya_Latn', 'kmb_Latn-por_Latn', + 'kmb_Latn-run_Latn', 'kmb_Latn-sna_Latn', 'kmb_Latn-som_Latn', + 'kmb_Latn-sot_Latn', 'kmb_Latn-ssw_Latn', 'kmb_Latn-swh_Latn', + 'kmb_Latn-tir_Ethi', 'kmb_Latn-tsn_Latn', 'kmb_Latn-tso_Latn', + 'kmb_Latn-tum_Latn', 'kmb_Latn-twi_Latn', 'kmb_Latn-umb_Latn', + 'kmb_Latn-wol_Latn', 'kmb_Latn-xho_Latn', 'kmb_Latn-yor_Latn', + 'kmb_Latn-zul_Latn', 'kmr_Latn-pbt_Arab', 'kmr_Latn-prs_Arab', + 'kmr_Latn-tgk_Cyrl', 'knc_Arab-gaz_Latn', 'knc_Arab-kik_Latn', + 'knc_Arab-kin_Latn', 'knc_Arab-kmb_Latn', 'knc_Arab-knc_Latn', + 'knc_Arab-kon_Latn', 'knc_Arab-lin_Latn', 'knc_Arab-lua_Latn', + 'knc_Arab-lug_Latn', 'knc_Arab-luo_Latn', 'knc_Arab-nso_Latn', + 'knc_Arab-nus_Latn', 'knc_Arab-nya_Latn', 'knc_Arab-run_Latn', + 'knc_Arab-sna_Latn', 'knc_Arab-som_Latn', 'knc_Arab-sot_Latn', + 'knc_Arab-ssw_Latn', 'knc_Arab-swh_Latn', 'knc_Arab-tir_Ethi', + 'knc_Arab-tsn_Latn', 'knc_Arab-tso_Latn', 'knc_Arab-tum_Latn', + 'knc_Arab-twi_Latn', 'knc_Arab-umb_Latn', 'knc_Arab-wol_Latn', + 'knc_Arab-xho_Latn', 'knc_Arab-yor_Latn', 'knc_Arab-zul_Latn', + 'knc_Latn-gaz_Latn', 'knc_Latn-kik_Latn', 'knc_Latn-kin_Latn', + 'knc_Latn-kmb_Latn', 'knc_Latn-kon_Latn', 'knc_Latn-lin_Latn', + 'knc_Latn-lua_Latn', 'knc_Latn-lug_Latn', 'knc_Latn-luo_Latn', + 'knc_Latn-nso_Latn', 'knc_Latn-nus_Latn', 'knc_Latn-nya_Latn', + 'knc_Latn-run_Latn', 'knc_Latn-sna_Latn', 'knc_Latn-som_Latn', + 'knc_Latn-sot_Latn', 'knc_Latn-ssw_Latn', 'knc_Latn-swh_Latn', + 'knc_Latn-tir_Ethi', 'knc_Latn-tsn_Latn', 'knc_Latn-tso_Latn', + 'knc_Latn-tum_Latn', 'knc_Latn-twi_Latn', 'knc_Latn-umb_Latn', + 'knc_Latn-wol_Latn', 'knc_Latn-xho_Latn', 'knc_Latn-yor_Latn', + 'knc_Latn-zul_Latn', 'kon_Latn-gaz_Latn', 'kon_Latn-lin_Latn', + 'kon_Latn-lua_Latn', 'kon_Latn-lug_Latn', 'kon_Latn-luo_Latn', + 'kon_Latn-nso_Latn', 'kon_Latn-nus_Latn', 'kon_Latn-nya_Latn', + 'kon_Latn-run_Latn', 'kon_Latn-sna_Latn', 'kon_Latn-som_Latn', + 'kon_Latn-sot_Latn', 'kon_Latn-ssw_Latn', 'kon_Latn-swh_Latn', + 'kon_Latn-tir_Ethi', 'kon_Latn-tsn_Latn', 'kon_Latn-tso_Latn', + 'kon_Latn-tum_Latn', 'kon_Latn-twi_Latn', 'kon_Latn-umb_Latn', + 'kon_Latn-wol_Latn', 'kon_Latn-xho_Latn', 'kon_Latn-yor_Latn', + 'kon_Latn-zul_Latn', 'lao_Laoo-rus_Cyrl', 'lin_Latn-gaz_Latn', + 'lin_Latn-lua_Latn', 'lin_Latn-lug_Latn', 'lin_Latn-luo_Latn', + 'lin_Latn-nso_Latn', 'lin_Latn-nus_Latn', 'lin_Latn-nya_Latn', + 'lin_Latn-run_Latn', 'lin_Latn-sna_Latn', 'lin_Latn-som_Latn', + 'lin_Latn-sot_Latn', 'lin_Latn-ssw_Latn', 'lin_Latn-swh_Latn', + 'lin_Latn-tir_Ethi', 
'lin_Latn-tsn_Latn', 'lin_Latn-tso_Latn', + 'lin_Latn-tum_Latn', 'lin_Latn-twi_Latn', 'lin_Latn-umb_Latn', + 'lin_Latn-wol_Latn', 'lin_Latn-xho_Latn', 'lin_Latn-yor_Latn', + 'lin_Latn-zul_Latn', 'ltg_Latn-rus_Cyrl', 'lua_Latn-gaz_Latn', + 'lua_Latn-lug_Latn', 'lua_Latn-luo_Latn', 'lua_Latn-nso_Latn', + 'lua_Latn-nus_Latn', 'lua_Latn-nya_Latn', 'lua_Latn-run_Latn', + 'lua_Latn-sna_Latn', 'lua_Latn-som_Latn', 'lua_Latn-sot_Latn', + 'lua_Latn-ssw_Latn', 'lua_Latn-swh_Latn', 'lua_Latn-tir_Ethi', + 'lua_Latn-tsn_Latn', 'lua_Latn-tso_Latn', 'lua_Latn-tum_Latn', + 'lua_Latn-twi_Latn', 'lua_Latn-umb_Latn', 'lua_Latn-wol_Latn', + 'lua_Latn-xho_Latn', 'lua_Latn-yor_Latn', 'lua_Latn-zul_Latn', + 'lug_Latn-gaz_Latn', 'lug_Latn-luo_Latn', 'lug_Latn-nso_Latn', + 'lug_Latn-nus_Latn', 'lug_Latn-nya_Latn', 'lug_Latn-run_Latn', + 'lug_Latn-sna_Latn', 'lug_Latn-som_Latn', 'lug_Latn-sot_Latn', + 'lug_Latn-ssw_Latn', 'lug_Latn-swh_Latn', 'lug_Latn-tir_Ethi', + 'lug_Latn-tsn_Latn', 'lug_Latn-tso_Latn', 'lug_Latn-tum_Latn', + 'lug_Latn-twi_Latn', 'lug_Latn-umb_Latn', 'lug_Latn-wol_Latn', + 'lug_Latn-xho_Latn', 'lug_Latn-yor_Latn', 'lug_Latn-zul_Latn', + 'luo_Latn-gaz_Latn', 'luo_Latn-nso_Latn', 'luo_Latn-nus_Latn', + 'luo_Latn-nya_Latn', 'luo_Latn-run_Latn', 'luo_Latn-sna_Latn', + 'luo_Latn-som_Latn', 'luo_Latn-sot_Latn', 'luo_Latn-ssw_Latn', + 'luo_Latn-swh_Latn', 'luo_Latn-tir_Ethi', 'luo_Latn-tsn_Latn', + 'luo_Latn-tso_Latn', 'luo_Latn-tum_Latn', 'luo_Latn-twi_Latn', + 'luo_Latn-umb_Latn', 'luo_Latn-wol_Latn', 'luo_Latn-xho_Latn', + 'luo_Latn-yor_Latn', 'luo_Latn-zul_Latn', 'mag_Deva-mai_Deva', + 'mag_Deva-mal_Mlym', 'mag_Deva-mar_Deva', 'mag_Deva-npi_Deva', + 'mag_Deva-ory_Orya', 'mag_Deva-pan_Guru', 'mag_Deva-san_Deva', + 'mag_Deva-sat_Beng', 'mag_Deva-sin_Sinh', 'mag_Deva-snd_Arab', + 'mag_Deva-tam_Taml', 'mag_Deva-tel_Telu', 'mag_Deva-urd_Arab', + 'mai_Deva-mal_Mlym', 'mai_Deva-mar_Deva', 'mai_Deva-npi_Deva', + 'mai_Deva-ory_Orya', 'mai_Deva-pan_Guru', 'mai_Deva-san_Deva', + 'mai_Deva-sat_Beng', 'mai_Deva-sin_Sinh', 'mai_Deva-snd_Arab', + 'mai_Deva-tam_Taml', 'mai_Deva-tel_Telu', 'mai_Deva-urd_Arab', + 'mal_Mlym-mar_Deva', 'mal_Mlym-npi_Deva', 'mal_Mlym-ory_Orya', + 'mal_Mlym-pan_Guru', 'mal_Mlym-san_Deva', 'mal_Mlym-sat_Beng', + 'mal_Mlym-sin_Sinh', 'mal_Mlym-snd_Arab', 'mal_Mlym-tam_Taml', + 'mal_Mlym-tel_Telu', 'mal_Mlym-urd_Arab', 'mar_Deva-npi_Deva', + 'mar_Deva-ory_Orya', 'mar_Deva-pan_Guru', 'mar_Deva-san_Deva', + 'mar_Deva-sat_Beng', 'mar_Deva-sin_Sinh', 'mar_Deva-snd_Arab', + 'mar_Deva-tam_Taml', 'mar_Deva-tel_Telu', 'mar_Deva-urd_Arab', + 'min_Latn-mri_Latn', 'min_Latn-pag_Latn', 'min_Latn-plt_Latn', + 'min_Latn-smo_Latn', 'min_Latn-sun_Latn', 'min_Latn-war_Latn', + 'mri_Latn-pag_Latn', 'mri_Latn-smo_Latn', 'mri_Latn-sun_Latn', + 'mri_Latn-war_Latn', 'npi_Deva-ory_Orya', 'npi_Deva-pan_Guru', + 'npi_Deva-san_Deva', 'npi_Deva-sat_Beng', 'npi_Deva-sin_Sinh', + 'npi_Deva-snd_Arab', 'npi_Deva-tam_Taml', 'npi_Deva-tel_Telu', + 'npi_Deva-urd_Arab', 'nso_Latn-gaz_Latn', 'nso_Latn-nus_Latn', + 'nso_Latn-nya_Latn', 'nso_Latn-run_Latn', 'nso_Latn-sna_Latn', + 'nso_Latn-som_Latn', 'nso_Latn-sot_Latn', 'nso_Latn-ssw_Latn', + 'nso_Latn-swh_Latn', 'nso_Latn-tir_Ethi', 'nso_Latn-tsn_Latn', + 'nso_Latn-tso_Latn', 'nso_Latn-tum_Latn', 'nso_Latn-twi_Latn', + 'nso_Latn-umb_Latn', 'nso_Latn-wol_Latn', 'nso_Latn-xho_Latn', + 'nso_Latn-yor_Latn', 'nso_Latn-zul_Latn', 'nus_Latn-gaz_Latn', + 'nus_Latn-nya_Latn', 'nus_Latn-run_Latn', 'nus_Latn-sna_Latn', + 'nus_Latn-som_Latn', 'nus_Latn-sot_Latn', 'nus_Latn-ssw_Latn', + 
'nus_Latn-swh_Latn', 'nus_Latn-tir_Ethi', 'nus_Latn-tsn_Latn', + 'nus_Latn-tso_Latn', 'nus_Latn-tum_Latn', 'nus_Latn-twi_Latn', + 'nus_Latn-umb_Latn', 'nus_Latn-wol_Latn', 'nus_Latn-xho_Latn', + 'nus_Latn-yor_Latn', 'nus_Latn-zul_Latn', 'nya_Latn-gaz_Latn', + 'nya_Latn-run_Latn', 'nya_Latn-sna_Latn', 'nya_Latn-som_Latn', + 'nya_Latn-sot_Latn', 'nya_Latn-ssw_Latn', 'nya_Latn-swh_Latn', + 'nya_Latn-tir_Ethi', 'nya_Latn-tsn_Latn', 'nya_Latn-tso_Latn', + 'nya_Latn-tum_Latn', 'nya_Latn-twi_Latn', 'nya_Latn-umb_Latn', + 'nya_Latn-wol_Latn', 'nya_Latn-xho_Latn', 'nya_Latn-yor_Latn', + 'nya_Latn-zul_Latn', 'oci_Latn-por_Latn', 'ory_Orya-pan_Guru', + 'ory_Orya-san_Deva', 'ory_Orya-sat_Beng', 'ory_Orya-sin_Sinh', + 'ory_Orya-snd_Arab', 'ory_Orya-tam_Taml', 'ory_Orya-tel_Telu', + 'ory_Orya-urd_Arab', 'pag_Latn-smo_Latn', 'pag_Latn-sun_Latn', + 'pan_Guru-san_Deva', 'pan_Guru-sat_Beng', 'pan_Guru-sin_Sinh', + 'pan_Guru-snd_Arab', 'pan_Guru-tam_Taml', 'pan_Guru-tel_Telu', + 'pan_Guru-urd_Arab', 'pbt_Arab-tam_Taml', 'pbt_Arab-tgk_Cyrl', + 'plt_Latn-mri_Latn', 'plt_Latn-pag_Latn', 'plt_Latn-smo_Latn', + 'plt_Latn-sun_Latn', 'plt_Latn-war_Latn', 'por_Latn-ayr_Latn', + 'por_Latn-quy_Latn', 'prs_Arab-pbt_Arab', 'prs_Arab-tgk_Cyrl', + 'quy_Latn-spa_Latn', 'run_Latn-sna_Latn', 'run_Latn-som_Latn', + 'run_Latn-sot_Latn', 'run_Latn-ssw_Latn', 'run_Latn-swh_Latn', + 'run_Latn-tir_Ethi', 'run_Latn-tsn_Latn', 'run_Latn-tso_Latn', + 'run_Latn-tum_Latn', 'run_Latn-twi_Latn', 'run_Latn-umb_Latn', + 'run_Latn-wol_Latn', 'run_Latn-xho_Latn', 'run_Latn-yor_Latn', + 'run_Latn-zul_Latn', 'rus_Cyrl-tat_Cyrl', 'rus_Cyrl-tgk_Cyrl', + 'san_Deva-sat_Beng', 'san_Deva-sin_Sinh', 'san_Deva-snd_Arab', + 'san_Deva-tam_Taml', 'san_Deva-tel_Telu', 'san_Deva-urd_Arab', + 'sat_Beng-sin_Sinh', 'sat_Beng-snd_Arab', 'sat_Beng-tam_Taml', + 'sat_Beng-tel_Telu', 'sat_Beng-urd_Arab', 'sin_Sinh-snd_Arab', + 'sin_Sinh-tam_Taml', 'sin_Sinh-tel_Telu', 'sin_Sinh-urd_Arab', + 'smo_Latn-sun_Latn', 'smo_Latn-war_Latn', 'sna_Latn-som_Latn', + 'sna_Latn-sot_Latn', 'sna_Latn-ssw_Latn', 'sna_Latn-swh_Latn', + 'sna_Latn-tir_Ethi', 'sna_Latn-tsn_Latn', 'sna_Latn-tso_Latn', + 'sna_Latn-tum_Latn', 'sna_Latn-twi_Latn', 'sna_Latn-umb_Latn', + 'sna_Latn-wol_Latn', 'sna_Latn-xho_Latn', 'sna_Latn-yor_Latn', + 'sna_Latn-zul_Latn', 'snd_Arab-tam_Taml', 'snd_Arab-tel_Telu', + 'snd_Arab-urd_Arab', 'som_Latn-sot_Latn', 'som_Latn-ssw_Latn', + 'som_Latn-swh_Latn', 'som_Latn-tir_Ethi', 'som_Latn-tsn_Latn', + 'som_Latn-tso_Latn', 'som_Latn-tum_Latn', 'som_Latn-twi_Latn', + 'som_Latn-umb_Latn', 'som_Latn-wol_Latn', 'som_Latn-xho_Latn', + 'som_Latn-yor_Latn', 'som_Latn-zul_Latn', 'sot_Latn-ssw_Latn', + 'sot_Latn-swh_Latn', 'sot_Latn-tir_Ethi', 'sot_Latn-tsn_Latn', + 'sot_Latn-tso_Latn', 'sot_Latn-tum_Latn', 'sot_Latn-twi_Latn', + 'sot_Latn-umb_Latn', 'sot_Latn-wol_Latn', 'sot_Latn-xho_Latn', + 'sot_Latn-yor_Latn', 'sot_Latn-zul_Latn', 'ssw_Latn-swh_Latn', + 'ssw_Latn-tir_Ethi', 'ssw_Latn-tsn_Latn', 'ssw_Latn-tso_Latn', + 'ssw_Latn-tum_Latn', 'ssw_Latn-twi_Latn', 'ssw_Latn-umb_Latn', + 'ssw_Latn-wol_Latn', 'ssw_Latn-xho_Latn', 'ssw_Latn-yor_Latn', + 'ssw_Latn-zul_Latn', 'sun_Latn-war_Latn', 'swh_Latn-tir_Ethi', + 'swh_Latn-tsn_Latn', 'swh_Latn-tso_Latn', 'swh_Latn-tum_Latn', + 'swh_Latn-twi_Latn', 'swh_Latn-umb_Latn', 'swh_Latn-wol_Latn', + 'swh_Latn-xho_Latn', 'swh_Latn-yor_Latn', 'swh_Latn-zul_Latn', + 'tam_Taml-tel_Telu', 'tam_Taml-urd_Arab', 'tat_Cyrl-tuk_Latn', + 'tat_Cyrl-uig_Arab', 'tat_Cyrl-uzn_Latn', 'tel_Telu-urd_Arab', + 'tir_Ethi-tsn_Latn', 'tir_Ethi-tso_Latn', 
'tir_Ethi-tum_Latn', + 'tir_Ethi-twi_Latn', 'tir_Ethi-umb_Latn', 'tir_Ethi-wol_Latn', + 'tir_Ethi-xho_Latn', 'tir_Ethi-yor_Latn', 'tir_Ethi-zul_Latn', + 'tsn_Latn-tso_Latn', 'tsn_Latn-tum_Latn', 'tsn_Latn-twi_Latn', + 'tsn_Latn-umb_Latn', 'tsn_Latn-wol_Latn', 'tsn_Latn-xho_Latn', + 'tsn_Latn-yor_Latn', 'tsn_Latn-zul_Latn', 'tso_Latn-tum_Latn', + 'tso_Latn-twi_Latn', 'tso_Latn-umb_Latn', 'tso_Latn-wol_Latn', + 'tso_Latn-xho_Latn', 'tso_Latn-yor_Latn', 'tso_Latn-zul_Latn', + 'tuk_Latn-uig_Arab', 'tuk_Latn-uzn_Latn', 'tum_Latn-twi_Latn', + 'tum_Latn-umb_Latn', 'tum_Latn-wol_Latn', 'tum_Latn-xho_Latn', + 'tum_Latn-yor_Latn', 'tum_Latn-zul_Latn', 'twi_Latn-umb_Latn', + 'twi_Latn-wol_Latn', 'twi_Latn-xho_Latn', 'twi_Latn-yor_Latn', + 'twi_Latn-zul_Latn', 'uig_Arab-uzn_Latn', 'umb_Latn-wol_Latn', + 'umb_Latn-xho_Latn', 'umb_Latn-yor_Latn', 'umb_Latn-zul_Latn', + 'wol_Latn-xho_Latn', 'wol_Latn-yor_Latn', 'wol_Latn-zul_Latn', + 'xho_Latn-yor_Latn', 'xho_Latn-zul_Latn', 'yor_Latn-zul_Latn' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('nllb', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_universal_dependencies(self): + subset = [ + 'af_afribooms', 'akk_pisandub', 'akk_riao', 'aqz_tudet', 'sq_tsa', + 'am_att', 'grc_perseus', 'grc_proiel', 'apu_ufpa', 'ar_nyuad', + 'ar_padt', 'ar_pud', 'hy_armtdp', 'aii_as', 'bm_crb', 'eu_bdt', + 'be_hse', 'bho_bhtb', 'br_keb', 'bg_btb', 'bxr_bdt', 'yue_hk', + 'ca_ancora', 'zh_cfl', 'zh_gsd', 'zh_gsdsimp', 'zh_hk', 'zh_pud', + 'ckt_hse', 'lzh_kyoto', 'cop_scriptorium', 'hr_set', 'cs_cac', + 'cs_cltt', 'cs_fictree', 'cs_pdt', 'cs_pud', 'da_ddt', 'nl_alpino', + 'nl_lassysmall', 'en_esl', 'en_ewt', 'en_gum', 'en_gumreddit', + 'en_lines', 'en_partut', 'en_pronouns', 'en_pud', 'myv_jr', + 'et_edt', 'et_ewt', 'fo_farpahc', 'fo_oft', 'fi_ftb', 'fi_ood', + 'fi_pud', 'fi_tdt', 'fr_fqb', 'fr_ftb', 'fr_gsd', 'fr_partut', + 'fr_pud', 'fr_sequoia', 'fr_spoken', 'gl_ctg', 'gl_treegal', + 'de_gsd', 'de_hdt', 'de_lit', 'de_pud', 'got_proiel', 'el_gdt', + 'he_htb', 'qhe_hiencs', 'hi_hdtb', 'hi_pud', 'hu_szeged', + 'is_icepahc', 'is_pud', 'id_csui', 'id_gsd', 'id_pud', 'ga_idt', + 'it_isdt', 'it_partut', 'it_postwita', 'it_pud', 'it_twittiro', + 'it_vit', 'ja_bccwj', 'ja_gsd', 'ja_modern', 'ja_pud', 'krl_kkpp', + 'kk_ktb', 'kfm_aha', 'koi_uh', 'kpv_ikdp', 'kpv_lattice', 'ko_gsd', + 'ko_kaist', 'ko_pud', 'kmr_mg', 'la_ittb', 'la_llct', 'la_perseus', + 'la_proiel', 'lv_lvtb', 'lt_alksnis', 'lt_hse', 'olo_kkpp', + 'mt_mudt', 'gv_cadhan', 'mr_ufal', 'gun_dooley', 'gun_thomas', + 'mdf_jr', 'myu_tudet', 'pcm_nsc', 'nyq_aha', 'sme_giella', + 'no_bokmaal', 'no_nynorsk', 'no_nynorsklia', 'cu_proiel', + 'fro_srcmf', 'orv_rnc', 'orv_torot', 'otk_tonqq', 'fa_perdt', + 'fa_seraji', 'pl_lfg', 'pl_pdb', 'pl_pud', 'pt_bosque', 'pt_gsd', + 'pt_pud', 'ro_nonstandard', 'ro_rrt', 'ro_simonero', 'ru_gsd', + 'ru_pud', 'ru_syntagrus', 'ru_taiga', 'sa_ufal', 'sa_vedic', + 'gd_arcosg', 'sr_set', 'sms_giellagas', 'sk_snk', 'sl_ssj', + 'sl_sst', 'soj_aha', 'ajp_madar', 'es_ancora', 'es_gsd', 'es_pud', + 'swl_sslc', 'sv_lines', 'sv_pud', 'sv_talbanken', 'gsw_uzh', + 'tl_trg', 'tl_ugnayan', 'ta_mwtt', 'ta_ttb', 'te_mtg', 'th_pud', + 'tpn_tudet', 'qtd_sagt', 'tr_boun', 'tr_gb', 'tr_imst', 'tr_pud', + 'uk_iu', 'hsb_ufal', 'ur_udtb', 'ug_udt', 'vi_vtb', 'wbp_ufal', + 'cy_ccg', 'wo_wtb', 'yo_ytb' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + 
self.download_subset('universal_dependencies', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_imdb(self): + dataset = MsDataset.load('imdb') + if isinstance(dataset, MsDataset): + lens = len(dataset) + print(f'dataset imdb len: {lens}') + self.assertTrue(lens > 0) + else: + assert isinstance(dataset, dict) + lens = {key: len(subset) for key, subset in dataset.items()} + print(f'dataset imdb len: {lens}') + self.assertTrue(all([_len > 0 for _len in lens.values()])) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_clue(self): + subset = [ + 'afqmc', 'tnews', 'iflytek', 'cmnli', 'cluewsc2020', 'csl', + 'cmrc2018', 'drcd', 'chid', 'c3', 'ocnli', 'diagnostics' + ] + for subset_name in subset: + self.download_subset('clue', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_wikitext(self): + subset = [ + 'wikitext-103-v1', 'wikitext-2-v1', 'wikitext-103-raw-v1', + 'wikitext-2-raw-v1' + ] + for subset_name in subset: + self.download_subset('wikitext', subset_name) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_download_xnli(self): + subset = [ + 'XNLI', 'tydiqa', 'SQuAD', 'PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', + 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', + 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', + 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', + 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', + 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', + 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', + 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', + 'PAN-X.yo', 'PAN-X.zh', 'MLQA.ar.ar', 'MLQA.ar.de', 'MLQA.ar.vi', + 'MLQA.ar.zh', 'MLQA.ar.en', 'MLQA.ar.es', 'MLQA.ar.hi', + 'MLQA.de.ar', 'MLQA.de.de', 'MLQA.de.vi', 'MLQA.de.zh', + 'MLQA.de.en', 'MLQA.de.es', 'MLQA.de.hi', 'MLQA.vi.ar', + 'MLQA.vi.de', 'MLQA.vi.vi', 'MLQA.vi.zh', 'MLQA.vi.en', + 'MLQA.vi.es', 'MLQA.vi.hi', 'MLQA.zh.ar', 'MLQA.zh.de', + 'MLQA.zh.vi', 'MLQA.zh.zh', 'MLQA.zh.en', 'MLQA.zh.es', + 'MLQA.zh.hi', 'MLQA.en.ar', 'MLQA.en.de', 'MLQA.en.vi', + 'MLQA.en.zh', 'MLQA.en.en', 'MLQA.en.es', 'MLQA.en.hi', + 'MLQA.es.ar', 'MLQA.es.de', 'MLQA.es.vi', 'MLQA.es.zh', + 'MLQA.es.en', 'MLQA.es.es', 'MLQA.es.hi', 'MLQA.hi.ar', + 'MLQA.hi.de', 'MLQA.hi.vi', 'MLQA.hi.zh', 'MLQA.hi.en', + 'MLQA.hi.es', 'MLQA.hi.hi', 'XQuAD.ar', 'XQuAD.de', 'XQuAD.vi', + 'XQuAD.zh', 'XQuAD.en', 'XQuAD.es', 'XQuAD.hi', 'XQuAD.el', + 'XQuAD.ru', 'XQuAD.th', 'XQuAD.tr', 'bucc18.de', 'bucc18.fr', + 'bucc18.zh', 'bucc18.ru', 'PAWS-X.de', 'PAWS-X.en', 'PAWS-X.es', + 'PAWS-X.fr', 'PAWS-X.ja', 'PAWS-X.ko', 'PAWS-X.zh', 'tatoeba.afr', + 'tatoeba.ara', 'tatoeba.ben', 'tatoeba.bul', 'tatoeba.deu', + 'tatoeba.cmn', 'tatoeba.ell', 'tatoeba.est', 'tatoeba.eus', + 'tatoeba.fin', 'tatoeba.fra', 'tatoeba.heb', 'tatoeba.hin', + 'tatoeba.hun', 'tatoeba.ind', 'tatoeba.ita', 'tatoeba.jav', + 'tatoeba.jpn', 'tatoeba.kat', 'tatoeba.kaz', 'tatoeba.kor', + 'tatoeba.mal', 'tatoeba.mar', 'tatoeba.nld', 'tatoeba.pes', + 'tatoeba.por', 'tatoeba.rus', 'tatoeba.spa', 'tatoeba.swh', + 'tatoeba.tam', 'tatoeba.tel', 'tatoeba.tgl', 'tatoeba.tha', + 'tatoeba.tur', 'tatoeba.urd', 'tatoeba.vie', 'udpos.Afrikaans', + 'udpos.Arabic', 'udpos.Basque', 'udpos.Bulgarian', 'udpos.Dutch', + 'udpos.English', 'udpos.Estonian', 'udpos.Finnish', 'udpos.French', + 'udpos.German', 'udpos.Greek', 'udpos.Hebrew', 
'udpos.Hindi', + 'udpos.Hungarian', 'udpos.Indonesian', 'udpos.Italian', + 'udpos.Japanese', 'udpos.Kazakh', 'udpos.Korean', 'udpos.Chinese', + 'udpos.Marathi', 'udpos.Persian', 'udpos.Portuguese', + 'udpos.Russian', 'udpos.Spanish', 'udpos.Tagalog', 'udpos.Tamil', + 'udpos.Telugu', 'udpos.Thai', 'udpos.Turkish', 'udpos.Urdu', + 'udpos.Vietnamese', 'udpos.Yoruba' + ] + subset = subset[:self.subset_count] + for subset_name in subset: + self.download_subset('xtreme', subset_name) diff --git a/tests/models/test_deberta_v2_backbone.py b/tests/models/test_deberta_v2_backbone.py new file mode 100644 index 00000000..706b18f8 --- /dev/null +++ b/tests/models/test_deberta_v2_backbone.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.models import Model +from modelscope.models.nlp.deberta_v2 import (DebertaV2ForMaskedLM, + DebertaV2Model) +from modelscope.utils.constant import Tasks + + +class DebertaV2BackboneTest(unittest.TestCase): + + def test_load_model(self): + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite') + self.assertTrue(model.__class__ == DebertaV2ForMaskedLM) + model = Model.from_pretrained( + 'damo/nlp_debertav2_fill-mask_chinese-lite', task=Tasks.backbone) + self.assertTrue(model.__class__ == DebertaV2Model) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/outputs/__init__.py b/tests/outputs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/outputs/test_model_outputs.py b/tests/outputs/test_model_outputs.py new file mode 100644 index 00000000..31271869 --- /dev/null +++ b/tests/outputs/test_model_outputs.py @@ -0,0 +1,30 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import torch + +from modelscope.outputs import TextClassificationModelOutput +from modelscope.utils.test_utils import test_level + + +class TestModelOutput(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_model_outputs(self): + outputs = TextClassificationModelOutput(logits=torch.Tensor([1])) + self.assertEqual(outputs['logits'], torch.Tensor([1])) + self.assertEqual(outputs[0], torch.Tensor([1])) + self.assertEqual(outputs.logits, torch.Tensor([1])) + logits, loss = outputs + self.assertEqual(logits, torch.Tensor([1])) + self.assertTrue(loss is None) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/nlp/test_faq.py b/tests/pipelines/nlp/test_faq.py new file mode 100644 index 00000000..8bac55d4 --- /dev/null +++ b/tests/pipelines/nlp/test_faq.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForFaqRanking, SbertForFaqRetrieval +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import FaqPipeline +from modelscope.preprocessors import FaqPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FaqTest(unittest.TestCase): + model_id = '/Users/tanfan/Desktop/Workdir/Gitlab/maas/MaaS-lib/.faq_test_model' + param = { + 'query_set': ['明天星期几', '今天星期六', '今天星期六'], + 'support_set': [{ + 'text': '今天星期六', + 'label': 'label0' + }, { + 'text': '明天星期几', + 'label': 'label1' + }] + } + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + # def test_run_with_direct_file_download(self): + # cache_path = self.model_id # snapshot_download(self.model_id) + # preprocessor = FaqPreprocessor(cache_path) + # model = SbertForFaq(cache_path) + # pipeline_ins = FaqPipeline(model, preprocessor=preprocessor) + # + # result = pipeline_ins(self.param) + # print(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = FaqPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.faq, model=model, preprocessor=preprocessor) + result = pipeline_ins(self.param) + print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_model_name(self): + # pipeline_ins = pipeline(task=Tasks.faq, model=self.model_id) + # result = pipeline_ins(self.param) + # print(result) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + # def test_run_with_default_model(self): + # pipeline_ins = pipeline(task=Tasks.faq) + # print(pipeline_ins(self.param)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_conversational_text_to_sql.py b/tests/pipelines/test_conversational_text_to_sql.py index 21a4e0ce..17fffcaf 100644 --- a/tests/pipelines/test_conversational_text_to_sql.py +++ b/tests/pipelines/test_conversational_text_to_sql.py @@ -9,7 +9,8 @@ from modelscope.pipelines.nlp import ConversationalTextToSqlPipeline from modelscope.preprocessors import ConversationalTextToSqlPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import text2sql_tracking_and_print_results +from modelscope.utils.nlp.space_T_en.utils import \ + text2sql_tracking_and_print_results from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py index 5894297f..2ee46388 100644 --- a/tests/pipelines/test_dialog_intent_prediction.py +++ b/tests/pipelines/test_dialog_intent_prediction.py @@ -25,7 +25,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id, revision='update') + cache_path = snapshot_download(self.model_id) preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) model = SpaceForDialogIntent( model_dir=cache_path, @@ -46,7 +46,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): 
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='update') + model = Model.from_pretrained(self.model_id) preprocessor = DialogIntentPredictionPreprocessor( model_dir=model.model_dir) @@ -64,10 +64,7 @@ class DialogIntentPredictionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, model=self.model_id, model_revision='update') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] for my_pipeline, item in list(zip(pipelines, self.test_case)): print(my_pipeline(item)) diff --git a/tests/pipelines/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py index 19d6ed2f..6b6259ce 100644 --- a/tests/pipelines/test_dialog_modeling.py +++ b/tests/pipelines/test_dialog_modeling.py @@ -115,8 +115,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download( - self.model_id, revision='task_oriented_conversation') + cache_path = snapshot_download(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=cache_path) model = SpaceForDialogModeling( @@ -130,8 +129,7 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained( - self.model_id, revision='task_oriented_conversation') + model = Model.from_pretrained(self.model_id) preprocessor = DialogModelingPreprocessor(model_dir=model.model_dir) pipelines = [ @@ -142,20 +140,12 @@ class DialogModelingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, - model=self.model_id, - model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): - pipelines = [ - pipeline( - task=self.task, model_revision='task_oriented_conversation') - ] + pipelines = [pipeline(task=self.task)] self.generate_and_print_dialog_response(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_dialog_state_tracking.py b/tests/pipelines/test_dialog_state_tracking.py index 81bdd9be..6cdd5ee7 100644 --- a/tests/pipelines/test_dialog_state_tracking.py +++ b/tests/pipelines/test_dialog_state_tracking.py @@ -3,13 +3,14 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SpaceForDialogStateTracking +from modelscope.models.nlp import SpaceForDST from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import DialogStateTrackingPipeline from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.nlp.nlp_utils import tracking_and_print_dialog_states +from modelscope.utils.nlp.space.utils_dst 
import \ + tracking_and_print_dialog_states from modelscope.utils.test_utils import test_level @@ -85,9 +86,9 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - cache_path = snapshot_download(self.model_id, revision='update') + cache_path = snapshot_download(self.model_id) - model = SpaceForDialogStateTracking(cache_path) + model = SpaceForDST.from_pretrained(cache_path) preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) pipelines = [ DialogStateTrackingPipeline( @@ -101,7 +102,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='update') + model = Model.from_pretrained(self.model_id) preprocessor = DialogStateTrackingPreprocessor( model_dir=model.model_dir) @@ -115,10 +116,7 @@ class DialogStateTrackingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): - pipelines = [ - pipeline( - task=self.task, model=self.model_id, model_revision='update') - ] + pipelines = [pipeline(task=self.task, model=self.model_id)] tracking_and_print_dialog_states(self.test_case, pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_faq_question_answering.py b/tests/pipelines/test_faq_question_answering.py index 7eea0ddf..2f66f516 100644 --- a/tests/pipelines/test_faq_question_answering.py +++ b/tests/pipelines/test_faq_question_answering.py @@ -47,9 +47,9 @@ class FaqQuestionAnsweringTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) - preprocessor = FaqQuestionAnsweringPreprocessor(cache_path) - model = SbertForFaqQuestionAnswering(cache_path) - model.load_checkpoint(cache_path) + preprocessor = FaqQuestionAnsweringPreprocessor.from_pretrained( + cache_path) + model = SbertForFaqQuestionAnswering.from_pretrained(cache_path) pipeline_ins = FaqQuestionAnsweringPipeline( model, preprocessor=preprocessor) result = pipeline_ins(self.param) diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 0e5e242b..568865c6 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -5,8 +5,7 @@ from regex import R from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, - VecoForMaskedLM) +from modelscope.models.nlp import SbertForMaskedLM, VecoForMaskedLM from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import FillMaskPipeline from modelscope.preprocessors import NLPPreprocessor @@ -55,7 +54,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): model_dir = snapshot_download(self.model_id_sbert[language]) preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) - model = StructBertForMaskedLM.from_pretrained(model_dir) + model = SbertForMaskedLM.from_pretrained(model_dir) pipeline1 = FillMaskPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.fill_mask, model=model, 
preprocessor=preprocessor) @@ -130,18 +129,6 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') - # bert - language = 'zh' - model = Model.from_pretrained(self.model_id_bert, revision='beta') - preprocessor = NLPPreprocessor( - model.model_dir, first_sequence='sentence', second_sequence=None) - pipeline_ins = pipeline( - Tasks.fill_mask, model=model, preprocessor=preprocessor) - pipeline_ins.model, f'fill_mask_bert_{language}' - print( - f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' - f'{pipeline_ins(self.test_inputs[language])}\n') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index db4b9912..5f2dcb25 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -27,9 +27,8 @@ class NLITest(unittest.TestCase, DemoCompatibilityCheck): def test_run_with_direct_file_download(self): cache_path = snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) - model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + model = Model.from_pretrained(cache_path) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') diff --git a/tests/pipelines/test_part_of_speech.py b/tests/pipelines/test_part_of_speech.py index 61cdfe73..038a90f0 100644 --- a/tests/pipelines/test_part_of_speech.py +++ b/tests/pipelines/test_part_of_speech.py @@ -23,7 +23,7 @@ class PartOfSpeechTest(unittest.TestCase): model = TokenClassificationModel.from_pretrained(cache_path) pipeline1 = TokenClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( - Tasks.token_classification, model=model, preprocessor=tokenizer) + Tasks.part_of_speech, model=model, preprocessor=tokenizer) print(f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence)}') print() diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 739dd7ab..e96724a8 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -4,7 +4,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import SentenceEmbedding +from modelscope.models.nlp import BertForSentenceEmbedding from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import SentenceEmbeddingPipeline from modelscope.preprocessors import SentenceEmbeddingPreprocessor @@ -40,7 +40,7 @@ class 
SentenceEmbeddingTest(unittest.TestCase): def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) tokenizer = SentenceEmbeddingPreprocessor(cache_path) - model = SentenceEmbedding.from_pretrained(cache_path) + model = BertForSentenceEmbedding.from_pretrained(cache_path) pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_embedding, model=model, preprocessor=tokenizer) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index 288d38c7..76db0a8f 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -5,7 +5,7 @@ from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSequenceClassification from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -28,8 +28,7 @@ class SentenceSimilarityTest(unittest.TestCase, DemoCompatibilityCheck): cache_path = snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) model = SbertForSequenceClassification.from_pretrained(cache_path) - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.sentence_similarity, model=model, preprocessor=tokenizer) print('test1') diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index d0b1b40f..b3d9b9d6 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -6,7 +6,7 @@ from modelscope.models import Model from modelscope.models.nlp.task_models.sequence_classification import \ SequenceClassificationModel from modelscope.pipelines import pipeline -from modelscope.pipelines.nlp import SequenceClassificationPipeline +from modelscope.pipelines.nlp import TextClassificationPipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -28,8 +28,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( self.model_id, num_labels=2, revision='beta') - pipeline1 = SequenceClassificationPipeline( - model, preprocessor=tokenizer) + pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.text_classification, model=model, preprocessor=tokenizer) print(f'sentence1: {self.sentence1}\n' diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 44f1531b..eece7f57 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -13,7 +13,7 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor -from 
modelscope.preprocessors.space_T_cn.fields.database import Database +from modelscope.preprocessors.nlp.space_T_cn.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.test_utils import test_level diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py new file mode 100644 index 00000000..5b38e116 --- /dev/null +++ b/tests/pipelines/test_text_classification.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.models import Model +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import TextClassificationPipeline +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class SequenceClassificationTest(unittest.TestCase, DemoCompatibilityCheck): + sentence1 = 'i like this wonderful place' + + def setUp(self) -> None: + self.model_id = 'damo/bert-base-sst2' + self.task = Tasks.text_classification + + def predict(self, pipeline_ins: TextClassificationPipeline): + from easynlp.appzoo import load_dataset + + set = load_dataset('glue', 'sst2') + data = set['test']['sentence'][:3] + + results = pipeline_ins(data[0]) + print(results) + results = pipeline_ins(data[1]) + print(results) + + print(data) + + def printDataset(self, dataset: MsDataset): + for i, r in enumerate(dataset): + if i > 10: + break + print(r) + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = SequenceClassificationPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.text_classification, + model=model, + preprocessor=preprocessor) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline_ins(input=self.sentence1)}') + + # @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_model_name(self): + text_classification = pipeline( + task=Tasks.text_classification, model=self.model_id) + result = text_classification( + MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise')) + self.printDataset(result) + + # @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_default_model(self): + text_classification = pipeline(task=Tasks.text_classification) + result = text_classification( + MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise')) + self.printDataset(result) + + # @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skip('nlp model does not support tensor input, skipped') + def test_run_with_modelscope_dataset(self): + text_classification = pipeline(task=Tasks.text_classification) + # loaded from modelscope dataset + dataset = MsDataset.load( + 'xcopa', + subset_name='translation-et', + namespace='damotest', + split='test', + target='premise') + result = 
text_classification(dataset) + self.printDataset(result) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py index 57fa809c..0b43e8b4 100644 --- a/tests/pipelines/test_text_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -4,7 +4,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import TextRanking +from modelscope.models.nlp import BertForTextRanking from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TextRankingPipeline from modelscope.preprocessors import TextRankingPreprocessor @@ -33,7 +33,7 @@ class TextRankingTest(unittest.TestCase): for model_id in self.models: cache_path = snapshot_download(model_id) tokenizer = TextRankingPreprocessor(cache_path) - model = TextRanking.from_pretrained(cache_path) + model = BertForTextRanking.from_pretrained(cache_path) pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.text_ranking, model=model, preprocessor=tokenizer) diff --git a/tests/trainers/test_finetune_sequence_classification.py b/tests/trainers/test_finetune_sequence_classification.py index aa8aba5c..ae780793 100644 --- a/tests/trainers/test_finetune_sequence_classification.py +++ b/tests/trainers/test_finetune_sequence_classification.py @@ -8,7 +8,7 @@ from modelscope.metainfo import Preprocessors, Trainers from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline -from modelscope.trainers import build_trainer +from modelscope.trainers import NlpTrainerArguments, build_trainer from modelscope.trainers.hooks import Hook from modelscope.trainers.nlp_trainer import (EpochBasedTrainer, NlpEpochBasedTrainer) @@ -38,6 +38,52 @@ class TestFinetuneSequenceClassification(unittest.TestCase): shutil.rmtree(self.tmp_dir) super().tearDown() + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_cfg_class(self): + dataset = MsDataset.load('clue', subset_name='tnews') + train_dataset = dataset['train'] + validation_dataset = dataset['validation'] + cfg_modify_fn = NlpTrainerArguments( + task=Tasks.text_classification, + preprocessor_type=Preprocessors.sen_cls_tokenizer, + train_first_sequence='sentence', + train_label='label', + labels=[ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', + '12', '13', '14' + ], + max_epochs=5, + optimizer_args={ + 'lr': 3e-5, + }, + lr_scheduler_args={ + 'total_iters': int(len(train_dataset) / 32) * 5, + }, + checkpoint_saving_type='BestCkptSaverHook', + metric_key='accuracy', + train_batch_size_per_gpu=32, + checkpoint_interval=1, + train_workers_per_gpu=0, + checkpoint_by_epoch=False, + evaluation_interval=1, + evaluation_by_epoch=False, + eval_workers_per_gpu=0, + metrics=['seq-cls-metric'], + ) + + kwargs = dict( + model='damo/nlp_structbert_backbone_base_std', + train_dataset=train_dataset, + eval_dataset=validation_dataset, + work_dir=self.tmp_dir, + seed=42, + cfg_modify_fn=cfg_modify_fn) + + os.environ['LOCAL_RANK'] = '0' + trainer: EpochBasedTrainer = build_trainer( + name=Trainers.nlp_base_trainer, default_args=kwargs) + trainer.train() + @unittest.skip( 'Skip testing trainer repeatable, because it\'s unstable in daily UT') def 
test_trainer_repeatable(self): @@ -330,7 +376,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 2, 'dataloader': { 'batch_size_per_gpu': 16, - 'workers_per_gpu': 1 + 'workers_per_gpu': 0 }, 'optimizer': { 'type': 'AdamW', @@ -351,7 +397,6 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'hooks': [{ 'type': 'CheckpointHook', 'interval': 1, - 'save_dir': '/root' }, { 'type': 'TextLoggerHook', 'interval': 1 @@ -366,7 +411,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): cfg['evaluation'] = { 'dataloader': { 'batch_size_per_gpu': 128, - 'workers_per_gpu': 1, + 'workers_per_gpu': 0, 'shuffle': False } } diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 5b0c9982..9380ad0f 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -7,8 +7,7 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Metrics from modelscope.models.base import Model -from modelscope.models.nlp.sequence_classification import \ - SbertForSequenceClassification +from modelscope.models.nlp import SbertForSequenceClassification from modelscope.msdatasets import MsDataset from modelscope.pipelines import pipeline from modelscope.trainers import EpochBasedTrainer, build_trainer From 6ddafb32183eacdec7414833f952c5c66153a869 Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Tue, 25 Oct 2022 12:55:41 +0800 Subject: [PATCH 114/129] [to #42322933]caption finetune done, add belu metric Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10318299 --- modelscope/metainfo.py | 6 + modelscope/metrics/__init__.py | 4 + modelscope/metrics/accuracy_metric.py | 46 ++ modelscope/metrics/bleu_metric.py | 42 ++ modelscope/metrics/builder.py | 1 + modelscope/metrics/ciderD/__init__.py | 1 + modelscope/metrics/ciderD/ciderD.py | 57 ++ modelscope/metrics/ciderD/ciderD_scorer.py | 233 +++++++ .../multi_modal/ofa/adaptor/__init__.py | 0 .../models/multi_modal/ofa/generate/search.py | 2 +- .../ofa/generate/sequence_generator.py | 32 +- .../models/multi_modal/ofa/modeling_ofa.py | 10 +- .../models/multi_modal/ofa/utils/constant.py | 4 +- .../models/multi_modal/ofa_for_all_tasks.py | 97 ++- .../cv/image_classification_pipeline.py | 2 + modelscope/preprocessors/multi_modal.py | 55 +- modelscope/preprocessors/ofa/base.py | 62 +- .../preprocessors/ofa/image_captioning.py | 47 +- .../preprocessors/ofa/image_classification.py | 20 +- .../preprocessors/ofa/ocr_recognition.py | 20 +- modelscope/preprocessors/ofa/summarization.py | 16 +- .../preprocessors/ofa/text_classification.py | 63 +- .../ofa/text_to_image_synthesis.py | 16 +- modelscope/preprocessors/ofa/utils/collate.py | 4 + .../preprocessors/ofa/utils/constant.py | 13 + .../preprocessors/ofa/visual_entailment.py | 21 +- .../preprocessors/ofa/visual_grounding.py | 21 +- .../ofa/visual_question_answering.py | 20 +- .../trainers/multi_modal/ofa/__init__.py | 3 + .../trainers/multi_modal/ofa/ofa_trainer.py | 154 ++++ .../multi_modal/ofa/ofa_trainer_utils.py | 243 +++++++ modelscope/trainers/trainer.py | 21 +- modelscope/trainers/utils/inference.py | 37 +- modelscope/utils/constant.py | 1 + modelscope/utils/device.py | 16 +- modelscope/utils/multi_modal/fp16/__init__.py | 14 + modelscope/utils/multi_modal/fp16/fp16.py | 655 ++++++++++++++++++ modelscope/utils/multi_modal/fp16/fp16util.py | 216 ++++++ .../utils/multi_modal/fp16/loss_scaler.py | 237 +++++++ requirements/multi-modal.txt | 1 + 
tests/trainers/test_ofa_trainer.py | 105 +++ 41 files changed, 2461 insertions(+), 157 deletions(-) create mode 100644 modelscope/metrics/accuracy_metric.py create mode 100644 modelscope/metrics/bleu_metric.py create mode 100755 modelscope/metrics/ciderD/__init__.py create mode 100755 modelscope/metrics/ciderD/ciderD.py create mode 100755 modelscope/metrics/ciderD/ciderD_scorer.py create mode 100644 modelscope/models/multi_modal/ofa/adaptor/__init__.py create mode 100644 modelscope/preprocessors/ofa/utils/constant.py create mode 100644 modelscope/trainers/multi_modal/ofa/__init__.py create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer.py create mode 100644 modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py create mode 100644 modelscope/utils/multi_modal/fp16/__init__.py create mode 100755 modelscope/utils/multi_modal/fp16/fp16.py create mode 100644 modelscope/utils/multi_modal/fp16/fp16util.py create mode 100755 modelscope/utils/multi_modal/fp16/loss_scaler.py create mode 100644 tests/trainers/test_ofa_trainer.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 01b08699..f77ff299 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -281,6 +281,7 @@ class Trainers(object): # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' + ofa = 'ofa' # cv trainers image_instance_segmentation = 'image-instance-segmentation' @@ -375,6 +376,9 @@ class Metrics(object): accuracy = 'accuracy' audio_noise_metric = 'audio-noise-metric' + # text gen + BLEU = 'bleu' + # metrics for image denoise task image_denoise_metric = 'image-denoise-metric' @@ -395,6 +399,8 @@ class Metrics(object): movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' # metric for inpainting task image_inpainting_metric = 'image-inpainting-metric' + # metric for ocr + NED = 'ned' class Optimizers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index e6a03a22..c022eaf4 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from .token_classification_metric import TokenClassificationMetric from .video_summarization_metric import VideoSummarizationMetric from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric + from .accuracy_metric import AccuracyMetric + from .bleu_metric import BleuMetric from .image_inpainting_metric import ImageInpaintingMetric else: @@ -36,6 +38,8 @@ else: 'video_summarization_metric': ['VideoSummarizationMetric'], 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], 'image_inpainting_metric': ['ImageInpaintingMetric'], + 'accuracy_metric': ['AccuracyMetric'], + 'bleu_metric': ['BleuMetric'], } import sys diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py new file mode 100644 index 00000000..1761786e --- /dev/null +++ b/modelscope/metrics/accuracy_metric.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.accuracy) +class AccuracyMetric(Metric): + """The metric computation class for classification classes. + + This metric class calculates accuracy for the whole input batches. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.preds = [] + self.labels = [] + + def add(self, outputs: Dict, inputs: Dict): + label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS + ground_truths = inputs[label_name] + eval_results = outputs[label_name] + assert type(ground_truths) == type(eval_results) + if isinstance(ground_truths, list): + self.preds.extend(eval_results) + self.labels.extend(ground_truths) + elif isinstance(ground_truths, np.ndarray): + self.preds.extend(eval_results.tolist()) + self.labels.extend(ground_truths.tolist()) + else: + raise TypeError('only support list or np.ndarray') + + def evaluate(self): + assert len(self.preds) == len(self.labels) + return { + MetricKeys.ACCURACY: (np.asarray([ + pred == ref for pred, ref in zip(self.preds, self.labels) + ])).mean().item() + } diff --git a/modelscope/metrics/bleu_metric.py b/modelscope/metrics/bleu_metric.py new file mode 100644 index 00000000..7c134b6a --- /dev/null +++ b/modelscope/metrics/bleu_metric.py @@ -0,0 +1,42 @@ +from itertools import zip_longest +from typing import Dict + +import sacrebleu + +from modelscope.metainfo import Metrics +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + +EVAL_BLEU_ORDER = 4 + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.BLEU) +class BleuMetric(Metric): + """The metric computation class for text generation tasks, based on BLEU. + + This metric class calculates corpus-level BLEU for the whole input batches. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.eval_tokenized_bleu = kwargs.get('eval_tokenized_bleu', False) + self.hyp_name = kwargs.get('hyp_name', 'hyp') + self.ref_name = kwargs.get('ref_name', 'ref') + self.refs = list() + self.hyps = list() + + def add(self, outputs: Dict, inputs: Dict): + self.refs.extend(inputs[self.ref_name]) + self.hyps.extend(outputs[self.hyp_name]) + + def evaluate(self): + if self.eval_tokenized_bleu: + bleu = sacrebleu.corpus_bleu( + self.hyps, list(zip_longest(*self.refs)), tokenize='none') + else: + bleu = sacrebleu.corpus_bleu(self.hyps, + list(zip_longest(*self.refs))) + return { + MetricKeys.BLEU_4: bleu.score, + } diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 1c8e16d7..da3b64c7 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -23,6 +23,7 @@ class MetricKeys(object): BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' ROUGE_L = 'rouge-l' + NED = 'ned' # ocr metric task_default_metrics = { diff --git a/modelscope/metrics/ciderD/__init__.py b/modelscope/metrics/ciderD/__init__.py new file mode 100755 index 00000000..3f7d85bb --- /dev/null +++ b/modelscope/metrics/ciderD/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/modelscope/metrics/ciderD/ciderD.py b/modelscope/metrics/ciderD/ciderD.py new file mode 100755 index 00000000..05c7eb23 --- /dev/null +++ b/modelscope/metrics/ciderD/ciderD.py @@ -0,0 +1,57 @@ +# Filename: ciderD.py +# +# Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric +# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) +# +# Creation Date: Sun Feb 8 14:16:54 2015 +# +# Authors: Ramakrishna Vedantam and Tsung-Yi Lin +from __future__ import absolute_import, division, print_function + +from .ciderD_scorer import CiderScorer + + +class CiderD: + """ + Main Class to compute the CIDEr metric + + """ + + def
__init__(self, n=4, sigma=6.0, df='corpus'): + # set cider to sum over 1 to 4-grams + self._n = n + # set the standard deviation parameter for gaussian penalty + self._sigma = sigma + # set where to compute document frequencies from + self._df = df + self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) + + def compute_score(self, gts, res): + """ + Main function to compute CIDEr score + :param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence> + ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence> + :return: cider (float) : computed CIDEr score for the corpus + """ # noqa + + # clear all the previous hypos and refs + tmp_cider_scorer = self.cider_scorer.copy_empty() + tmp_cider_scorer.clear() + for res_id in res: + + hypo = res_id['caption'] + ref = gts[res_id['image_id']] + + # Sanity check. + assert (type(hypo) is list) + assert (len(hypo) == 1) + assert (type(ref) is list) + assert (len(ref) > 0) + tmp_cider_scorer += (hypo[0], ref) + + (score, scores) = tmp_cider_scorer.compute_score() + + return score, scores + + def method(self): + return 'CIDEr-D' diff --git a/modelscope/metrics/ciderD/ciderD_scorer.py b/modelscope/metrics/ciderD/ciderD_scorer.py new file mode 100755 index 00000000..4157ec11 --- /dev/null +++ b/modelscope/metrics/ciderD/ciderD_scorer.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# Tsung-Yi Lin +# Ramakrishna Vedantam +from __future__ import absolute_import, division, print_function +import copy +import math +import os +import pdb +from collections import defaultdict + +import numpy as np +import six +from six.moves import cPickle + + +def precook(s, n=4, out=False): + """ + Takes a string as input and returns an object that can be given to + either cook_refs or cook_test. This is optional: cook_refs and cook_test + can take string arguments as well. + :param s: string : sentence to be converted into ngrams + :param n: int : number of ngrams for which representation is calculated + :return: term frequency vector for occurring ngrams + """ + words = s.split() + counts = defaultdict(int) + for k in range(1, n + 1): + for i in range(len(words) - k + 1): + ngram = tuple(words[i:i + k]) + counts[ngram] += 1 + return counts + + +def cook_refs(refs, n=4): # lhuang: oracle will call with "average" + '''Takes a list of reference sentences for a single segment + and returns an object that encapsulates everything that BLEU + needs to know about them. + :param refs: list of string : reference sentences for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (list of dict) + ''' + return [precook(ref, n) for ref in refs] + + +def cook_test(test, n=4): + '''Takes a test sentence and returns an object that + encapsulates everything that BLEU needs to know about it. + :param test: list of string : hypothesis sentence for some image + :param n: int : number of ngrams for which (ngram) representation is calculated + :return: result (dict) + ''' + return precook(test, n, True) + + +class CiderScorer(object): + """CIDEr scorer.
+ """ + + def copy(self): + ''' copy the refs.''' + new = CiderScorer(n=self.n) + new.ctest = copy.copy(self.ctest) + new.crefs = copy.copy(self.crefs) + return new + + def copy_empty(self): + new = CiderScorer(df_mode='corpus', n=self.n, sigma=self.sigma) + new.df_mode = self.df_mode + new.ref_len = self.ref_len + new.document_frequency = self.document_frequency + return new + + def __init__(self, df_mode='corpus', test=None, refs=None, n=4, sigma=6.0): + ''' singular instance ''' + self.n = n + self.sigma = sigma + self.crefs = [] + self.ctest = [] + self.df_mode = df_mode + self.ref_len = None + if self.df_mode != 'corpus': + pkl_file = cPickle.load( + open(df_mode, 'rb'), + **(dict(encoding='latin1') if six.PY3 else {})) + self.ref_len = np.log(float(pkl_file['ref_len'])) + self.document_frequency = pkl_file['document_frequency'] + else: + self.document_frequency = None + self.cook_append(test, refs) + + def clear(self): + self.crefs = [] + self.ctest = [] + + def cook_append(self, test, refs): + '''called by constructor and __iadd__ to avoid creating new instances.''' + + if refs is not None: + self.crefs.append(cook_refs(refs)) + if test is not None: + self.ctest.append(cook_test(test)) # N.B.: -1 + else: + self.ctest.append( + None) # lens of crefs and ctest have to match + + def size(self): + assert len(self.crefs) == len( + self.ctest), 'refs/test mismatch! %d<>%d' % (len( + self.crefs), len(self.ctest)) + return len(self.crefs) + + def __iadd__(self, other): + '''add an instance (e.g., from another sentence).''' + + if type(other) is tuple: + # avoid creating new CiderScorer instances + self.cook_append(other[0], other[1]) + else: + self.ctest.extend(other.ctest) + self.crefs.extend(other.crefs) + + return self + + def compute_doc_freq(self): + """ + Compute term frequency for reference data. + This will be used to compute idf (inverse document frequency later) + The term frequency is stored in the object + :return: None + """ + for refs in self.crefs: + # refs, k ref captions of one image + for ngram in set([ + ngram for ref in refs for (ngram, count) in ref.items() + ]): # noqa + self.document_frequency[ngram] += 1 + + def compute_cider(self): + + def counts2vec(cnts): + """ + Function maps counts of ngram to vector of tfidf weights. + The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. + The n-th entry of array denotes length of n-grams. + :param cnts: + :return: vec (array of dict), norm (array of float), length (int) + """ + vec = [defaultdict(float) for _ in range(self.n)] + length = 0 + norm = [0.0 for _ in range(self.n)] + for (ngram, term_freq) in cnts.items(): + # give word count 1 if it doesn't appear in reference corpus + df = np.log(max(1.0, self.document_frequency[ngram])) + # ngram index + n = len(ngram) - 1 + # tf (term_freq) * idf (precomputed idf) for n-grams + vec[n][ngram] = float(term_freq) * (self.ref_len - df) + # compute norm for the vector. the norm will be used for computing similarity + norm[n] += pow(vec[n][ngram], 2) + + if n == 1: + length += term_freq + norm = [np.sqrt(n) for n in norm] + return vec, norm, length + + def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): + ''' + Compute the cosine similarity of two vectors. 
+ :param vec_hyp: array of dictionary for vector corresponding to hypothesis + :param vec_ref: array of dictionary for vector corresponding to reference + :param norm_hyp: array of float for vector corresponding to hypothesis + :param norm_ref: array of float for vector corresponding to reference + :param length_hyp: int containing length of hypothesis + :param length_ref: int containing length of reference + :return: array of score for each n-grams cosine similarity + ''' + delta = float(length_hyp - length_ref) + # measure consine similarity + val = np.array([0.0 for _ in range(self.n)]) + for n in range(self.n): + # ngram + for (ngram, count) in vec_hyp[n].items(): + # vrama91 : added clipping + val[n] += min(vec_hyp[n][ngram], + vec_ref[n][ngram]) * vec_ref[n][ngram] + + if (norm_hyp[n] != 0) and (norm_ref[n] != 0): + val[n] /= (norm_hyp[n] * norm_ref[n]) + + assert (not math.isnan(val[n])) + # vrama91: added a length based gaussian penalty + val[n] *= np.e**(-(delta**2) / (2 * self.sigma**2)) + return val + + # compute log reference length + if self.df_mode == 'corpus': + self.ref_len = np.log(float(len(self.crefs))) + # elif self.df_mode == "coco-val-df": + # if coco option selected, use length of coco-val set + # self.ref_len = np.log(float(40504)) + + scores = [] + for test, refs in zip(self.ctest, self.crefs): + # compute vector for test captions + vec, norm, length = counts2vec(test) + # compute vector for ref captions + score = np.array([0.0 for _ in range(self.n)]) + for ref in refs: + vec_ref, norm_ref, length_ref = counts2vec(ref) + score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) + # change by vrama91 - mean of ngram scores, instead of sum + score_avg = np.mean(score) + # divide by number of references + score_avg /= len(refs) + # multiply score by 10 + score_avg *= 10.0 + # append score of an image to the score list + scores.append(score_avg) + return scores + + def compute_score(self, option=None, verbose=0): + # compute idf + if self.df_mode == 'corpus': + self.document_frequency = defaultdict(float) + self.compute_doc_freq() + # assert to check document frequency + assert (len(self.ctest) >= max(self.document_frequency.values())) + # import json for now and write the corresponding files + # compute cider score + score = self.compute_cider() + # debug + # print score + return np.mean(np.array(score)), np.array(score) diff --git a/modelscope/models/multi_modal/ofa/adaptor/__init__.py b/modelscope/models/multi_modal/ofa/adaptor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/multi_modal/ofa/generate/search.py b/modelscope/models/multi_modal/ofa/generate/search.py index 63ecb0a9..0dcaf6b3 100644 --- a/modelscope/models/multi_modal/ofa/generate/search.py +++ b/modelscope/models/multi_modal/ofa/generate/search.py @@ -148,7 +148,7 @@ class BeamSearch(Search): scores_buf = top_prediction[0] indices_buf = top_prediction[1] # Project back into relative indices and beams - beams_buf = indices_buf // vocab_size + beams_buf = torch.div(indices_buf, vocab_size, rounding_mode='floor') indices_buf = indices_buf.fmod(vocab_size) # At this point, beams_buf and indices_buf are single-dim and contain relative indices diff --git a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py index 590fb67b..e42d3c8e 100644 --- a/modelscope/models/multi_modal/ofa/generate/sequence_generator.py +++ b/modelscope/models/multi_modal/ofa/generate/sequence_generator.py @@ 
-385,12 +385,7 @@ class SequenceGenerator(nn.Module): attn = torch.empty(bsz * beam_size, avg_attn_scores.size(1), max_len + 2).to(scores) - # print("+++++++ debug attention shape +++++++") - # print("attn", attn.shape) - # print("avg_attn_scores", avg_attn_scores.shape) attn[:, :, step + 1].copy_(avg_attn_scores) - # print("attn[:, :, step + 1]", attn[:, :, step + 1].shape) - # print("attn", attn.shape) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( @@ -404,8 +399,28 @@ class SequenceGenerator(nn.Module): self.search.set_src_lengths(src_lengths) if self.repeat_ngram_blocker is not None: - lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, - beam_size, step) + # process prefix_tokens + p_toks_len = prefix_tokens.ne(self.pad).sum( + dim=1) if prefix_tokens is not None else None + if p_toks_len is not None: + p_toks_len_beam = p_toks_len.unsqueeze(-1).repeat( + 1, beam_size).view(-1) + no_repeat_ngram_size = self.repeat_ngram_blocker.no_repeat_ngram_size + out_prefix = p_toks_len_beam < ( + step + no_repeat_ngram_size - 1) + else: + out_prefix = torch.ones(bsz * beam_size).bool() + ngram_blocker_tokens = tokens[out_prefix] + ngram_blocker_lprobs = lprobs[out_prefix] + ngram_blocker_bsz = torch.div( + out_prefix.sum(), beam_size, rounding_mode='trunc') + + lprobs[out_prefix] = self.repeat_ngram_blocker( + tokens=ngram_blocker_tokens, + lprobs=ngram_blocker_lprobs, + bsz=ngram_blocker_bsz, + beam_size=beam_size, + step=step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( @@ -415,7 +430,6 @@ class SequenceGenerator(nn.Module): tokens[:, :step + 1], original_batch_idxs, ) - # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] @@ -671,7 +685,7 @@ class SequenceGenerator(nn.Module): cum_unfin.append(prev) cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx) - unfin_idx = bbsz_idx // beam_size + unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode='floor') sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx) # Create a set of "{sent}{unfin_idx}", where diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py index 01cc02f9..0a7a2ce6 100755 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -19,6 +19,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple import torch +from packaging import version from torch import Tensor, nn from torch.nn import functional as F from transformers.activations import ACT2FN @@ -40,6 +41,8 @@ logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = 'ofa-base' _CONFIG_FOR_DOC = 'OFAConfig' _TOKENIZER_FOR_DOC = 'OFATokenizer' +TORCH_VERSION = version.parse(torch.__version__) +TORCH_MESH_GRID_WARNING_VERSION = version.parse('1.9.1') DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 @@ -51,6 +54,7 @@ OFA_PRETRAINED_MODEL_ARCHIVE_LIST = [ 'ofa-medium', 'ofa-base', 'ofa-large', + 'ofa-huge', ] try: @@ -114,7 +118,11 @@ def make_image_bucket_position(bucket_size, num_relative_distance): """ coords_h = torch.arange(bucket_size) coords_w = torch.arange(bucket_size) - coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + if TORCH_VERSION > TORCH_MESH_GRID_WARNING_VERSION: + coords = torch.stack( + torch.meshgrid([coords_h, coords_w], indexing='ij')) # 2, Wh, Ww + else: + coords = 
torch.stack(torch.meshgrid([coords_h, coords_w])) coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = coords_flatten[:, :, None] - \ coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py index d3257383..b3776f8f 100644 --- a/modelscope/models/multi_modal/ofa/utils/constant.py +++ b/modelscope/models/multi_modal/ofa/utils/constant.py @@ -8,7 +8,7 @@ OFA_TASK_KEY_MAPPING = { Tasks.text_summarization: OutputKeys.TEXT, Tasks.visual_question_answering: OutputKeys.TEXT, Tasks.visual_grounding: OutputKeys.BOXES, - Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS), + Tasks.text_classification: OutputKeys.LABELS, Tasks.image_classification: OutputKeys.LABELS, - Tasks.visual_entailment: (OutputKeys.SCORES, OutputKeys.LABELS), + Tasks.visual_entailment: OutputKeys.LABELS, } diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 6e331228..56d19ad8 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -1,8 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import math +import os import string +from functools import partial from os import path as osp -from typing import Any, Dict +from typing import Any, Callable, Dict, List, Optional, Union import json import torch.cuda @@ -10,7 +12,6 @@ import torch.nn.functional as F from modelscope.metainfo import Models from modelscope.models import TorchModel -from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys from modelscope.preprocessors.ofa.utils.collate import collate_tokens @@ -66,10 +67,9 @@ class OfaForAllTasks(TorchModel): self.gen_type = self.cfg.model.get('gen_type', 'generation') assert self.gen_type in ['generation', 'traverse'], \ 'model.gen_type must be in ["generation", "traverse"]' - self._device = torch.device('cuda') if torch.cuda.is_available() \ - else torch.device('cpu') - self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id - ]).to(self._device) + self.bos_item = torch.LongTensor([self.tokenizer.bos_token_id]) + self.pad_item = torch.LongTensor([self.tokenizer.pad_token_id]) + self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id]) self.index2ans = {} self.ans2label_dict = {} self.load_ans2label() @@ -90,7 +90,8 @@ class OfaForAllTasks(TorchModel): self.val_masks_l = [] self.build_trie() sg_args['constraint_trie'] = self.constraint_trie - self.model.to(self._device) + else: + self.constraint_trie = None self.generator = sg.SequenceGenerator(**sg_args) inference_d = { 'generation': self._text_gen_inference, @@ -108,8 +109,16 @@ class OfaForAllTasks(TorchModel): } def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + input = move_to_device(input, self.model.device) + if self.model.training: + return self.model(**input['net_input']) + else: + return self.inference(input) + + def inference(self, input: Dict[str, Any]) -> Dict[str, Any]: ret = self.task_inference_mapping[self.cfg.task](input) - ret['samples'] = input['samples'] + if 'samples' in input: + ret['samples'] = input['samples'] for key in [ OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, OutputKeys.LABELS, OutputKeys.SCORES @@ -118,21 +127,33 @@ class OfaForAllTasks(TorchModel): ret[key] = None return ret - def postprocess(self, input: Dict[str, Tensor], - **kwargs) -> Dict[str, Tensor]: - if 
self.cfg.task == Tasks.image_captioning: - caption = [ - cap.translate(self.transtab).strip() - for cap in input[OutputKeys.CAPTION] - ] - input[OutputKeys.CAPTION] = caption + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + if not self.model.training and self.cfg.task == Tasks.image_captioning: + caption = input[OutputKeys.CAPTION] + result_l = list() + for cap in caption: + result_l.append(cap.translate(self.transtab).strip()) + input[OutputKeys.CAPTION] = result_l return input def _text_gen_inference(self, input): - input = move_to_device(input, self._device) - gen_output = self.generator.generate([self.model], input) - gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] - result = self.tokenizer.batch_decode(gen, skip_special_tokens=True) + gen_outputs = self.generator.generate([self.model], + input, + prefix_tokens=input.get( + 'prefix_tokens', None)) + gen_l = list() + for idx, gen_out in enumerate(gen_outputs): + if len(gen_out) > 0: + decode_tokens = gen_out[0]['tokens'] + if 'prefix_tokens' in input: + prefix_len = input['prefix_tokens'][idx].ne( + self.pad_item.to(self.model.device)).sum() + decode_tokens = decode_tokens[prefix_len:] + gen_l.append(decode_tokens) + else: + gen_l.append('') + result = self.tokenizer.batch_decode(gen_l, skip_special_tokens=True) + result = [item.strip() for item in result] # text generation tasks have no score ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result} if self.cfg.task.endswith('classification'): @@ -140,7 +161,6 @@ class OfaForAllTasks(TorchModel): return ret def _visual_grounding_inference(self, input): - input = move_to_device(input, self._device) gen_output = self.generator.generate([self.model], input) tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] region_coord_l = list() @@ -160,7 +180,6 @@ class OfaForAllTasks(TorchModel): } def _traverse_inference(self, input): - input = move_to_device(input, self._device) encoder_input = dict() for key in input['net_input'].keys(): encoder_input[key] = input['net_input'][key] @@ -170,13 +189,14 @@ class OfaForAllTasks(TorchModel): valid_size = len(val_ans) valid_tgt_items = [ torch.cat([ - torch.tensor(decoder_prompt[1:]), valid_answer, + torch.tensor(decoder_prompt[1:]).to('cpu'), valid_answer, self.eos_item ]) for decoder_prompt in input['decoder_prompts'] for valid_answer in val_ans ] valid_prev_items = [ - torch.cat([torch.tensor(decoder_prompt), valid_answer]) + torch.cat( + [torch.tensor(decoder_prompt).to('cpu'), valid_answer]) for decoder_prompt in input['decoder_prompts'] for valid_answer in val_ans ] @@ -184,19 +204,19 @@ class OfaForAllTasks(TorchModel): torch.cat([ torch.zeros( len(decoder_prompt) - 1, - valid_constraint_mask.size(1)).bool().to(self._device), + valid_constraint_mask.size(1)).bool(), valid_constraint_mask], dim=0) # yapf: disable for decoder_prompt in input['decoder_prompts'] # yapf: disable for valid_constraint_mask in val_masks] # yapf: disable valid_tgt = collate_tokens( valid_tgt_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) valid_prev_output = collate_tokens( valid_prev_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) val_masks = collate_tokens( valid_constraint_mask_items, - pad_idx=self.tokenizer.pad_token_id).to(self._device) + pad_idx=self.tokenizer.pad_token_id).to(self.model.device) new_encoder_out = { 'last_hidden_state': 
encoder_out['last_hidden_state'].repeat_interleave( @@ -271,10 +291,23 @@ class OfaForAllTasks(TorchModel): self.val_masks_l += [ constraint_mask_list[i:i + self.val_batch_size] ] - self.val_ans_l = move_to_device(self.val_ans_l, self._device) - self.val_masks_l = move_to_device(self.val_masks_l, self._device) def load_ans2label(self): if self.cfg.model.get('answer2label', None): - filename = osp.join(self.model_dir, self.cfg.model.answer2label) - self.ans2label_dict = json.load(open(filename)) + ans2label_file = osp.join(self.model_dir, + self.cfg.model.answer2label) + with open(ans2label_file, 'r') as reader: + self.ans2label_dict = json.load(reader) + + def save_pretrained(self, + target_folder: Union[str, os.PathLike], + save_checkpoint_names: Union[str, List[str]] = None, + save_function: Callable = None, + config: Optional[dict] = None, + **kwargs): + super(OfaForAllTasks, self). \ + save_pretrained(target_folder=target_folder, + save_checkpoint_names=save_checkpoint_names, + save_function=partial(save_function, with_meta=False), + config=config, + **kwargs) diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py index 49467eab..69dbd1fb 100644 --- a/modelscope/pipelines/cv/image_classification_pipeline.py +++ b/modelscope/pipelines/cv/image_classification_pipeline.py @@ -13,6 +13,7 @@ from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import OfaPreprocessor, Preprocessor, load_image from modelscope.utils.constant import Tasks +from modelscope.utils.device import get_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -36,6 +37,7 @@ class ImageClassificationPipeline(Pipeline): else: raise NotImplementedError pipe_model.model.eval() + pipe_model.to(get_device()) if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 4427c096..256c5243 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
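A note on the `_text_gen_inference` change above: when `prefix_tokens` are supplied, the generator output repeats the prefix, so the code counts the non-pad prefix tokens per sample and slices them off before batch-decoding. A minimal sketch of that bookkeeping, with made-up token ids (the real pad id comes from the tokenizer):

    import torch

    pad_id = 1                                    # placeholder; the model uses tokenizer.pad_token_id
    prefix = torch.tensor([50, 51, pad_id])       # padded prefix row for one sample
    decoded = torch.tensor([50, 51, 7, 8, 9, 2])  # generator output starts with the prefix
    prefix_len = prefix.ne(pad_id).sum()          # 2 non-pad prefix tokens
    print(decoded[prefix_len:])                   # tensor([7, 8, 9, 2]) is what gets batch-decoded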
import os.path as osp +from io import BytesIO from typing import Any, Dict, List, Tuple, Union import torch @@ -15,6 +16,7 @@ from .base import Preprocessor from .builder import PREPROCESSORS from .ofa import * # noqa from .ofa.utils.collate import collate_fn +from .ofa.utils.constant import OFA_TASK_KEY_MAPPING __all__ = [ 'OfaPreprocessor', @@ -26,11 +28,16 @@ __all__ = [ Fields.multi_modal, module_name=Preprocessors.ofa_tasks_preprocessor) class OfaPreprocessor(Preprocessor): - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: model_dir (str): model path + mode: preprocessor mode (model mode) """ super().__init__(*args, **kwargs) preprocess_mapping = { @@ -45,25 +52,18 @@ class OfaPreprocessor(Preprocessor): Tasks.text_summarization: OfaSummarizationPreprocessor, Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor } - input_key_mapping = { - Tasks.ocr_recognition: ['image'], - Tasks.image_captioning: ['image'], - Tasks.image_classification: ['image'], - Tasks.text_summarization: ['text'], - Tasks.text_classification: ['text', 'text2'], - Tasks.visual_grounding: ['image', 'text'], - Tasks.visual_question_answering: ['image', 'text'], - Tasks.visual_entailment: ['image', 'text', 'text2'], - Tasks.text_to_image_synthesis: ['text'] - } model_dir = model_dir if osp.exists(model_dir) else snapshot_download( model_dir) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) - self.preprocess = preprocess_mapping[self.cfg.task](self.cfg, - model_dir) - self.keys = input_key_mapping[self.cfg.task] + self.preprocess = preprocess_mapping[self.cfg.task]( + cfg=self.cfg, model_dir=model_dir, mode=mode) + self.keys = OFA_TASK_KEY_MAPPING[self.cfg.task] self.tokenizer = self.preprocess.tokenizer + if kwargs.get('no_collate', None): + self.no_collate = True + else: + self.no_collate = False # just for modelscope demo def _build_dict(self, input: Union[Input, List[Input]]) -> Dict[str, Any]: @@ -74,20 +74,37 @@ class OfaPreprocessor(Preprocessor): data[key] = item return data + def _ofa_input_compatibility_conversion(self, data): + if 'image' in data and self.cfg.model.get('type', None) == 'ofa': + if isinstance(data['image'], str): + image = load_image(data['image']) + else: + image = data['image'] + if image.mode != 'RGB': + image = image.convert('RGB') + img_buffer = BytesIO() + image.save(img_buffer, format='JPEG') + data['image'] = Image.open(img_buffer) + return data + def __call__(self, input: Union[str, tuple, Dict[str, Any]], *args, **kwargs) -> Dict[str, Any]: if isinstance(input, dict): data = input else: data = self._build_dict(input) + data = self._ofa_input_compatibility_conversion(data) sample = self.preprocess(data) str_data = dict() for k, v in data.items(): str_data[k] = str(v) sample['sample'] = str_data - return collate_fn([sample], - pad_idx=self.tokenizer.pad_token_id, - eos_idx=self.tokenizer.eos_token_id) + if self.no_collate: + return sample + else: + return collate_fn([sample], + pad_idx=self.tokenizer.pad_token_id, + eos_idx=self.tokenizer.eos_token_id) @PREPROCESSORS.register_module( @@ -140,7 +157,7 @@ class MPlugPreprocessor(Preprocessor): def image_open(self, path: str) -> Tuple[Image.Image, int]: if path not in self._image_map: index = len(self._image_map) - self._image_map[path] = (load_image(path), index) + self._image_map[path] = (Image.open(path), index) return self._image_map[path] def __call__( diff --git 
a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 691f8b36..55b3895d 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -1,26 +1,31 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import re +import string from os import path as osp import json import numpy as np import torch +from PIL import Image from modelscope.models.multi_modal.ofa import OFATokenizer, OFATokenizerZH +from modelscope.preprocessors.image import load_image from modelscope.utils.trie import Trie +from .utils.constant import OFA_TASK_KEY_MAPPING from .utils.random_help import set_torch_seed class OfaBasePreprocessor: - def __init__(self, cfg, model_dir): - """preprocess the data + def __init__(self, cfg, model_dir, mode, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path Args: cfg(modelscope.utils.config.ConfigDict) : model config model_dir (str): model path """ self.cfg = cfg + self.mode = mode self.language = self.cfg.model.get('language', 'en') if self.language == 'en': tokenizer = OFATokenizer.from_pretrained(model_dir) @@ -41,6 +46,7 @@ class OfaBasePreprocessor: for key, value in tokenizer.get_vocab().items() } self.max_src_length = cfg.model.get('max_src_length', 256) + self.max_tgt_length = cfg.model.get('max_tgt_length', 256) self.max_image_size = cfg.model.get('max_image_size', 512) self.language = self.cfg.model.get('language', 'en') self.prompt_type = self.cfg.model.get('prompt_type', 'none') @@ -56,26 +62,40 @@ class OfaBasePreprocessor: self.mean = [0.5, 0.5, 0.5] self.std = [0.5, 0.5, 0.5] self.patch_image_size = self.cfg.model.get('patch_image_size', 480) + self.column_map = { + key: key + for key in OFA_TASK_KEY_MAPPING[self.cfg.task] + } + if hasattr(self.cfg, + 'dataset') and self.cfg.dataset.column_map is not None: + for k, v in self.cfg.dataset.column_map.items(): + self.column_map[k] = v + self.transtab = str.maketrans( + {key: None + for key in string.punctuation}) self.constraint_trie = None - self.index2ans = {} - if self.cfg.model.get('answer2label', False): + if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(model_dir, self.cfg.model.answer2label) - ans2label_dict = json.load(open(ans2label_file, 'r')) + with open(ans2label_file, 'r') as reader: + ans2label_dict = json.load(reader) + self.ans2label = ans2label_dict + self.label2ans = {v: k for k, v in self.ans2label.items()} self.constraint_trie = Trie(tokenizer.eos_token_id) for i, answer in enumerate(ans2label_dict.keys()): - answer_item = tokenizer( - ' ' + answer, - return_tensors='pt', - add_special_tokens=False).input_ids.squeeze(0) + answer_item = self.tokenize_text( + ' ' + answer, add_bos=False, add_eos=False) self.constraint_trie.insert([tokenizer.bos_token_id] + answer_item.tolist() + [tokenizer.eos_token_id]) - def get_inputs(self, text, add_bos=True, add_eos=True): + def tokenize_text(self, text, add_bos=True, add_eos=True): + if text is None: + return None inputs = self.tokenizer( text, max_length=self.max_src_length, add_special_tokens=False, + truncation=True, return_tensors='pt')['input_ids'].squeeze(0) if add_bos: inputs = torch.cat([self.bos_item, inputs]) @@ -85,7 +105,7 @@ class OfaBasePreprocessor: @staticmethod def pre_caption(caption, max_words=None): - caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ')\ + caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ') \ .replace('/', ' ').replace('<person>', 'person') caption = re.sub( @@ -123,3 +143,23 @@ class
OfaBasePreprocessor: question = ' '.join(question_words[:max_ques_words]) return question + + def add_constraint_mask(self, sample): + target_itm = sample['target'] + len_label_itm = target_itm.ne(self.pad_item).sum(dim=0).item() + if self.constraint_trie: + constraint_mask = torch.zeros( + (len(target_itm), len(self.tgt_dict))).bool() + start_idx = len(target_itm) - len_label_itm + for i in range(start_idx, len(target_itm)): + constraint_prefix_token = self.bos_item.tolist( + ) + target_itm[start_idx:i].tolist() + constraint_nodes = self.constraint_trie.get_next_layer( + constraint_prefix_token) + constraint_mask[i][constraint_nodes] = True + sample['constraint_mask'] = constraint_mask + + def get_img_pil(self, path_or_url_or_pil): + image = path_or_url_or_pil if isinstance(path_or_url_or_pil, Image.Image) \ + else load_image(path_or_url_or_pil) + return image diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 318a8a6d..af623297 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -1,42 +1,67 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, Union +from typing import Any, Dict import torch -from PIL import Image from torchvision import transforms -from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaImageCaptioningPreprocessor, self).__init__(cfg, model_dir) + super(OfaImageCaptioningPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + sample = self._build_infer_sample(data) + target = data[self.column_map['text']] + target = target.translate(self.transtab).strip() + target_token_list = target.strip().split() + target = ' '.join(target_token_list[:self.max_tgt_length]) + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target'][:-1]]) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, 
'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } + if 'text' in self.column_map and self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index dd2de634..49968823 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -6,25 +6,33 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaImageClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ super(OfaImageClassificationPreprocessor, - self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) @@ -34,7 +42,7 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, 'patch_image': patch_image, diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 1d30e572..1761dbd4 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -1,7 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
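For the caption fine-tuning path above, `_build_train_sample` strips punctuation with `transtab`, truncates the caption to `max_tgt_length` words, tokenizes it as the target, and builds `prev_output_tokens` as the target shifted right with BOS prepended. A toy walk-through with made-up token ids (only the string handling and the shift are real; the ids are not from any tokenizer):

    import string
    import torch

    transtab = str.maketrans({key: None for key in string.punctuation})
    caption = 'Two dogs play, in the snow!'
    target_text = ' '.join(caption.translate(transtab).strip().split()[:16])  # max_tgt_length = 16 here
    print(target_text)                          # 'Two dogs play in the snow'
    bos = torch.tensor([0])                     # placeholder BOS id
    target_ids = torch.tensor([11, 12, 13, 2])  # made-up target ids ending with EOS
    print(torch.cat([bos, target_ids[:-1]]))    # prev_output_tokens: tensor([ 0, 11, 12, 13])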
-import random -import unicodedata -from typing import Any, Dict, Union +from typing import Any, Dict import torch from PIL import Image @@ -10,6 +8,7 @@ from torchvision.transforms import InterpolationMode from torchvision.transforms import functional as F from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) @@ -59,14 +58,21 @@ def ocr_resize(img, patch_image_size, is_document=False): class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaOcrRecognitionPreprocessor, self).__init__(cfg, model_dir) + super(OfaOcrRecognitionPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform if self.cfg.model.imagenet_default_mean_and_std: mean = IMAGENET_DEFAULT_MEAN @@ -89,7 +95,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', '图片上的文字是什么?') - inputs = self.get_inputs(prompt) + inputs = self.tokenize_text(prompt) sample = { 'source': inputs, diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index 99028e61..cfd3c23d 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -1,19 +1,27 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaSummarizationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaSummarizationPreprocessor, self).__init__(cfg, model_dir) + super(OfaSummarizationPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: source = super().pre_caption( @@ -23,7 +31,7 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' " {} " Summarize the article with a title: ') text = prompt.format(source) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'prev_output': diff --git a/modelscope/preprocessors/ofa/text_classification.py b/modelscope/preprocessors/ofa/text_classification.py index 5673a07f..24c4f67e 100644 --- a/modelscope/preprocessors/ofa/text_classification.py +++ b/modelscope/preprocessors/ofa/text_classification.py @@ -1,38 +1,81 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
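Several of the preprocessors above share the same `prompt_type` convention when building the decoder prompt: 'none' keeps only BOS, while 'prev_output' reuses the tokenized instruction without its trailing EOS. A small sketch with made-up ids:

    import torch

    bos_item = torch.tensor([0])               # placeholder BOS id
    inputs = torch.tensor([0, 21, 22, 23, 2])  # tokenized prompt: BOS ... EOS (made-up ids)
    prompt_type = 'prev_output'                # read from the model config in the real code
    if prompt_type == 'none':
        decoder_prompt = bos_item
    elif prompt_type == 'prev_output':
        decoder_prompt = inputs[:-1]           # drop EOS, keep the instruction as the prefix
    print(decoder_prompt)                      # tensor([ 0, 21, 22, 23])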
from typing import Any, Dict +import torch + +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaTextClassificationPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaTextClassificationPreprocessor, self).__init__(cfg, model_dir) + super(OfaTextClassificationPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_instruction(self, data): text1 = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) text2 = ' '.join( data['text2'].lower().strip().split()[:self.max_src_length]) prompt = ' can text1 " {} " imply text2 " {} "?' text = prompt.format(text1, text2) - inputs = self.get_inputs(text) + instruction_itm = self.tokenize_text(text) + return instruction_itm + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + instruction_itm = self._build_instruction(data) + assert 'label' in data, 'there must has `label` column in train phase ' + label = data['label'] + if self.label2ans: + label = self.label2ans[label] # ans + label_itm = self.tokenize_text(f' {label}', add_bos=False) + if self.prompt_type == 'none': + target_itm = label_itm + elif self.prompt_type == 'prev_output': + target_itm = torch.cat([instruction_itm[1:-1], label_itm]) + else: + raise NotImplementedError + prev_output_itm = torch.cat([self.bos_item, target_itm[:-1]]) + target_itm[:-len(label_itm)] = self.pad_item + sample = { + 'source': instruction_itm, + 'target': target_itm, + 'prev_output_tokens': prev_output_itm, + } + self.add_constraint_mask(sample) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + instruction_itm = self._build_instruction(data) if self.prompt_type == 'none': - decoder_prompt = self.bos_item - elif self.prompt_type == 'src': - decoder_prompt = inputs + prefix_token = [] elif self.prompt_type == 'prev_output': - decoder_prompt = inputs[:-1] + prefix_token = instruction_itm[:-1] # remove eos else: raise NotImplementedError sample = { - 'source': inputs, - 'decoder_prompt': decoder_prompt, + 'source': instruction_itm, + 'prefix_token': prefix_token, } + if 'label' in data: + sample['label'] = self.label2ans[data['label']] return sample diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index e10de82c..2f6000eb 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -3,26 +3,34 @@ from typing import Any, Dict import torch +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: - model_dir (str): model path + cfg(modelscope.utils.config.ConfigDict) : model config + model_dir (str): model path, + mode: preprocessor mode (model mode) """ super(OfaTextToImageSynthesisPreprocessor, 
- self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, mode, *args, **kwargs) self.max_src_length = 64 def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: source = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) source = 'what is the complete image? caption: {}'.format(source) - inputs = self.get_inputs(source) + inputs = self.tokenize_text(source) sample = { 'source': inputs, 'patch_images': None, diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index 4bfa8eca..f7775680 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -49,11 +49,15 @@ def collate_fn(samples, pad_idx, eos_idx): batch['conf'] = torch.cat([s['conf'] for s in samples], dim=0) if samples[0].get('ref_dict', None) is not None: batch['ref_dict'] = np.array([s['ref_dict'] for s in samples]) + if samples[0].get('label', None) is not None: + batch['labels'] = np.array([s['label'] for s in samples]).tolist() if samples[0].get('constraint_mask', None) is not None: batch['constraint_masks'] = merge('constraint_mask') if samples[0].get('decoder_prompt', None) is not None: batch['decoder_prompts'] = np.array( [s['decoder_prompt'].tolist() for s in samples]) + if samples[0].get('prefix_token', None) is not None: + batch['prefix_tokens'] = merge('prefix_token') # For detection and visual grounding if samples[0].get('w_resize_ratio', None) is not None: batch['w_resize_ratios'] = torch.stack( diff --git a/modelscope/preprocessors/ofa/utils/constant.py b/modelscope/preprocessors/ofa/utils/constant.py new file mode 100644 index 00000000..102d27c0 --- /dev/null +++ b/modelscope/preprocessors/ofa/utils/constant.py @@ -0,0 +1,13 @@ +from modelscope.utils.constant import Tasks + +OFA_TASK_KEY_MAPPING = { + Tasks.ocr_recognition: ['image'], + Tasks.image_captioning: ['image'], + Tasks.image_classification: ['image'], + Tasks.text_summarization: ['text'], + Tasks.text_classification: ['text', 'text2'], + Tasks.visual_grounding: ['image', 'text'], + Tasks.visual_question_answering: ['image', 'text'], + Tasks.visual_entailment: ['image', 'text', 'text2'], + Tasks.text_to_image_synthesis: ['text'] +} diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index 6002c4a6..61c3cc6a 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -6,24 +6,33 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaVisualEntailmentPreprocessor, self).__init__(cfg, model_dir) + super(OfaVisualEntailmentPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + 
interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) @@ -44,7 +53,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' can image and text1 " {} " imply text2 " {} "?') text = prompt.format(caption, hypothesis) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'src': diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index 022e5788..8b116463 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -6,24 +6,33 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ - super(OfaVisualGroundingPreprocessor, self).__init__(cfg, model_dir) + super(OfaVisualGroundingPreprocessor, + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) @@ -39,7 +48,7 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): prompt = self.cfg.model.get( 'prompt', ' which region does the text " {} " describe?') text = prompt.format(src_caption) - src_item = self.get_inputs(text) + src_item = self.tokenize_text(text) sample = { 'source': src_item, 'patch_image': patch_image, diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index d34d1db0..11104e7e 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -6,25 +6,33 @@ from PIL import Image from torchvision import transforms from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): - def __init__(self, cfg, model_dir): + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): """preprocess the data Args: cfg(modelscope.utils.config.ConfigDict) : model config - model_dir (str): model path + model_dir (str): model path, + mode: preprocessor mode (model mode) """ super(OfaVisualQuestionAnsweringPreprocessor, - self).__init__(cfg, model_dir) + self).__init__(cfg, model_dir, mode, *args, **kwargs) # Initialize transform self.patch_resize_transform = transforms.Compose([ lambda image: image.convert('RGB'), - transforms.Resize((self.patch_image_size, self.patch_image_size), - interpolation=Image.BICUBIC), + transforms.Resize( + (self.patch_image_size, 
self.patch_image_size), + interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize(mean=self.mean, std=self.std), ]) @@ -34,7 +42,7 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): data['image'], Image.Image) else load_image(data['image']) patch_image = self.patch_resize_transform(image) text = ' {}'.format(data['text']) - inputs = self.get_inputs(text) + inputs = self.tokenize_text(text) if self.prompt_type == 'none': decoder_prompt = self.bos_item elif self.prompt_type == 'src': diff --git a/modelscope/trainers/multi_modal/ofa/__init__.py b/modelscope/trainers/multi_modal/ofa/__init__.py new file mode 100644 index 00000000..34e4ec7a --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .ofa_trainer import OFATrainer diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py new file mode 100644 index 00000000..02853925 --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +import os +import shutil +from functools import partial +from typing import Callable, Dict, Optional, Tuple, Union + +import torch +from torch import distributed as dist +from torch import nn +from torch.utils.data import Dataset + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model, TorchModel +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.multi_modal import OfaPreprocessor +from modelscope.preprocessors.ofa.utils.collate import collate_fn +from modelscope.trainers import EpochBasedTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.optimizer.builder import build_optimizer +from modelscope.utils.config import Config +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, + ModeKeys) +from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, + get_schedule) + + +@TRAINERS.register_module(module_name=Trainers.ofa) +class OFATrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + model = Model.from_pretrained(model, revision=model_revision) + model_dir = model.model_dir + cfg = Config.from_file(cfg_file) + if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: + work_dir = cfg.train.work_dir + else: + work_dir = kwargs['work_dir'] + tokenizer_files = { + 'zh': [ + 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', + 'config.json' + ], + 'en': + ['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'], + } + for filename in tokenizer_files[cfg.model.get('language', 'en')]: + finetune_file = os.path.join(work_dir, filename) + pretrain_file = os.path.join(model_dir, filename) + if os.path.exists(finetune_file): + 
continue + if os.path.exists(pretrain_file): + shutil.copy(pretrain_file, finetune_file) + + if preprocessor is None: + preprocessor = { + ConfigKeys.train: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.TRAIN, no_collate=True), + ConfigKeys.val: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.EVAL, no_collate=True), + } + # use torchrun launch + world_size = int(os.environ.get('WORLD_SIZE', 1)) + epoch_steps = math.ceil( + len(train_dataset) / # noqa + (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa + cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs + cfg.train.criterion.tokenizer = model.tokenizer + self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( + cfg.train.criterion) + if optimizers[0] is None: + optimizer = build_optimizer(model, cfg=cfg.train.optimizer) + else: + optimizer = optimizers[0] + if optimizers[1] is None: + scheduler_class, scheduler_args = get_schedule( + cfg.train.lr_scheduler) + if scheduler_class is not None: + lr_scheduler = scheduler_class(**{'optimizer': optimizer}, + **scheduler_args) + else: + lr_scheduler = None + else: + lr_scheduler = optimizers[1] + optimizers = (optimizer, lr_scheduler) + if data_collator is None: + data_collator = partial( + collate_fn, + pad_idx=model.tokenizer.pad_token_id, + eos_idx=model.tokenizer.eos_token_id, + ) + if 'launcher' not in kwargs and cfg.train.get('launcher', None): + kwargs['launcher'] = cfg.train.launcher + if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): + kwargs['use_fp16'] = cfg.train.use_fp16 + kwargs['to_tensor'] = False + super().__init__( + model=model, + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + preprocessor=preprocessor, + optimizers=optimizers, + seed=seed, + **kwargs, + ) + + def train_step(self, model, inputs): + model.train() + model_outputs = model.forward(inputs) + loss, sample_size, logging_output = self.criterion( + model_outputs, inputs) + train_outputs = {'loss': loss} + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + self.train_outputs = train_outputs diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py new file mode 100644 index 00000000..2189a5db --- /dev/null +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -0,0 +1,243 @@ +# Copyright 2022 The OFA-Sys Team. +# All rights reserved. +# This source code is licensed under the Apache 2.0 license +# found in the LICENSE file in the root directory. 
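Before the criterion code below, note how the trainer above sizes the learning-rate schedule: steps per epoch come from the dataset length, the per-GPU batch size, and the world size read from the environment under torchrun, and the scheduler horizon is that figure times `max_epochs`. A toy calculation with made-up numbers:

    import math

    num_samples = 10000        # stands in for len(train_dataset)
    batch_size_per_gpu = 8     # cfg.train.dataloader.batch_size_per_gpu
    world_size = 2             # int(os.environ.get('WORLD_SIZE', 1)) under torchrun
    max_epochs = 3             # cfg.train.max_epochs
    epoch_steps = math.ceil(num_samples / (batch_size_per_gpu * world_size))  # 625
    num_train_steps = epoch_steps * max_epochs                                # 1875
    print(epoch_steps, num_train_steps)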
+import math
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from torch.nn.modules.loss import _Loss
+
+
+def construct_rdrop_sample(x):
+    if isinstance(x, dict):
+        for key in x:
+            x[key] = construct_rdrop_sample(x[key])
+        return x
+    elif isinstance(x, torch.Tensor):
+        return x.repeat(2, *([1] * (x.dim() - 1)))
+    elif isinstance(x, int):
+        return x * 2
+    elif isinstance(x, np.ndarray):
+        return x.repeat(2)
+    else:
+        raise NotImplementedError
+
+
+def kl_loss(p, q):
+    p_loss = F.kl_div(p, torch.exp(q), reduction='sum')
+    q_loss = F.kl_div(q, torch.exp(p), reduction='sum')
+    loss = (p_loss + q_loss) / 2
+    return loss
+
+
+def label_smoothed_nll_loss(lprobs,
+                            target,
+                            epsilon,
+                            update_num,
+                            reduce=True,
+                            drop_worst_ratio=0.0,
+                            drop_worst_after=0,
+                            use_rdrop=False,
+                            reg_alpha=1.0,
+                            constraint_masks=None,
+                            constraint_start=None,
+                            constraint_end=None):
+    if target.dim() == lprobs.dim() - 1:
+        target = target.unsqueeze(-1)
+    nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1)
+    if constraint_masks is not None:
+        smooth_loss = -lprobs.masked_fill(~constraint_masks, 0).sum(
+            dim=-1, keepdim=True).squeeze(-1)
+        eps_i = epsilon / (constraint_masks.sum(1) - 1 + 1e-6)
+    elif constraint_start is not None and constraint_end is not None:
+        constraint_range = [0, 1, 2, 3] + list(
+            range(constraint_start, constraint_end))
+        smooth_loss = -lprobs[:, constraint_range].sum(
+            dim=-1, keepdim=True).squeeze(-1)
+        eps_i = epsilon / (len(constraint_range) - 1 + 1e-6)
+    else:
+        smooth_loss = -lprobs.sum(dim=-1, keepdim=True).squeeze(-1)
+        eps_i = epsilon / (lprobs.size(-1) - 1)
+    loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
+    if drop_worst_ratio > 0 and update_num > drop_worst_after:
+        if use_rdrop:
+            true_batch_size = loss.size(0) // 2
+            _, indices = torch.topk(
+                loss[:true_batch_size],
+                k=int(true_batch_size * (1 - drop_worst_ratio)),
+                largest=False)
+            loss = torch.cat([loss[indices], loss[indices + true_batch_size]])
+            nll_loss = torch.cat(
+                [nll_loss[indices], nll_loss[indices + true_batch_size]])
+            lprobs = torch.cat(
+                [lprobs[indices], lprobs[indices + true_batch_size]])
+        else:
+            loss, indices = torch.topk(
+                loss,
+                k=int(loss.shape[0] * (1 - drop_worst_ratio)),
+                largest=False)
+            nll_loss = nll_loss[indices]
+            lprobs = lprobs[indices]
+
+    ntokens = loss.numel()
+    nll_loss = nll_loss.sum() / ntokens  # rescaling is handled later in the grads
+    loss = loss.sum() / ntokens  # rescaling is handled later in the grads
+    if use_rdrop:
+        true_batch_size = lprobs.size(0) // 2
+        p = lprobs[:true_batch_size]
+        q = lprobs[true_batch_size:]
+        if constraint_start is not None and constraint_end is not None:
+            constraint_range = [0, 1, 2, 3] + list(
+                range(constraint_start, constraint_end))
+            p = p[:, constraint_range]
+            q = q[:, constraint_range]
+        loss += kl_loss(p, q) * reg_alpha
+
+    return loss, nll_loss, ntokens
+
+
+class AdjustLabelSmoothedCrossEntropyCriterion(_Loss):
+
+    def __init__(self, args):
+        super().__init__()
+        self.sentence_avg = args.sentence_avg
+        self.eps = args.label_smoothing
+        self.ignore_prefix_size = args.ignore_prefix_size
+        self.ignore_eos = args.ignore_eos
+        self.report_accuracy = args.report_accuracy
+        self.drop_worst_ratio = args.drop_worst_ratio
+        self.drop_worst_after = args.drop_worst_after
+        self.use_rdrop = args.use_rdrop
+        self.reg_alpha = args.reg_alpha
+        self.sample_patch_num = args.sample_patch_num
+
+        self.constraint_start = None
+        self.constraint_end = None
+        if args.constraint_range:
+            constraint_start, constraint_end =
args.constraint_range.split(',') + self.constraint_start = int(constraint_start) + self.constraint_end = int(constraint_end) + self.padding_idx = args.tokenizer.pad_token_id + self.args = args + + def forward(self, output, sample, update_num=0, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + if self.use_rdrop: + construct_rdrop_sample(sample) + + loss, nll_loss, ntokens = self.compute_loss( + output, sample, update_num, reduce=reduce) + sample_size = ( + sample['target'].size(0) if self.sentence_avg else ntokens) + logging_output = { + 'loss': loss.data, + 'nll_loss': nll_loss.data, + 'ntokens': sample['ntokens'], + 'nsentences': sample['nsentences'], + 'sample_size': sample_size, + } + return loss, sample_size, logging_output + + def get_lprobs_and_target(self, net_output, sample): + conf = sample['conf'][:, None, None] if 'conf' in sample and sample[ + 'conf'] is not None else 1 + constraint_masks = None + if 'constraint_masks' in sample and sample[ + 'constraint_masks'] is not None: + constraint_masks = sample['constraint_masks'] + net_output[0].masked_fill_(~constraint_masks, -math.inf) + if self.constraint_start is not None and self.constraint_end is not None: + net_output[0][:, :, 4:self.constraint_start] = -math.inf + net_output[0][:, :, self.constraint_end:] = -math.inf + lprobs = F.log_softmax( + net_output[0], dim=-1, dtype=torch.float32) * conf + target = sample['target'] + if self.ignore_prefix_size > 0: + lprobs = lprobs[:, self.ignore_prefix_size:, :].contiguous() + target = target[:, self.ignore_prefix_size:].contiguous() + if constraint_masks is not None: + constraint_masks = constraint_masks[:, self.ignore_prefix_size:, :].contiguous() # yapf: disable + if self.ignore_eos: + bsz, seq_len, embed_dim = lprobs.size() + eos_indices = target.eq(self.task.tgt_dict.eos()) + lprobs = lprobs[~eos_indices].reshape(bsz, seq_len - 1, embed_dim) + target = target[~eos_indices].reshape(bsz, seq_len - 1) + if constraint_masks is not None: + constraint_masks = constraint_masks[~eos_indices].reshape( + bsz, seq_len - 1, embed_dim) + if constraint_masks is not None: + constraint_masks = constraint_masks.view(-1, + constraint_masks.size(-1)) + return lprobs.view(-1, + lprobs.size(-1)), target.view(-1), constraint_masks + + def compute_loss(self, net_output, sample, update_num, reduce=True): + lprobs, target, constraint_masks = self.get_lprobs_and_target( + net_output, sample) + if constraint_masks is not None: + constraint_masks = constraint_masks[target != self.padding_idx] + lprobs = lprobs[target != self.padding_idx] + target = target[target != self.padding_idx] + loss, nll_loss, ntokens = label_smoothed_nll_loss( + lprobs, + target, + self.eps, + update_num, + reduce=reduce, + drop_worst_ratio=self.drop_worst_ratio, + drop_worst_after=self.drop_worst_after, + use_rdrop=self.use_rdrop, + reg_alpha=self.reg_alpha, + constraint_masks=constraint_masks, + constraint_start=self.constraint_start, + constraint_end=self.constraint_end) + return loss, nll_loss, ntokens + + +def get_schedule(scheduler): + + if scheduler.name == 'const': + scheduler_class = transformers.get_constant_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(scheduler.warmup_proportion * scheduler.num_train_steps) + } + elif scheduler.name == 'linear': + scheduler_class = transformers.get_linear_schedule_with_warmup + 
scheduler_args = { + 'num_warmup_steps': + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps + } + elif scheduler.name == 'cosine': + scheduler_class = transformers.get_cosine_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps + } + elif scheduler.name == 'polynomial_decay': + scheduler_class = transformers.get_polynomial_decay_schedule_with_warmup + scheduler_args = { + 'num_warmup_steps': + int(scheduler.warmup_proportion * scheduler.num_train_steps), + 'num_training_steps': + scheduler.num_train_steps, + 'lr_end': + scheduler.lr_end + } + else: + raise NotImplementedError + + return scheduler_class, scheduler_args diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 0dc6ece4..f47bff10 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -168,19 +168,20 @@ class EpochBasedTrainer(BaseTrainer): device_name = f'cuda:{local_rank}' self.device = create_device(device_name) - self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, task_data_config=self.cfg.dataset.get('train', None) if hasattr( self.cfg, 'dataset') else None, - preprocessor=self.train_preprocessor) + preprocessor=self.train_preprocessor, + **kwargs) self.eval_dataset = self.to_task_dataset( eval_dataset, mode=ModeKeys.EVAL, task_data_config=self.cfg.dataset.get('val', None) if hasattr( self.cfg, 'dataset') else None, - preprocessor=self.eval_preprocessor) + preprocessor=self.eval_preprocessor, + **kwargs) self.train_data_collator, self.eval_default_collate = None, None if isinstance(data_collator, Mapping): @@ -216,7 +217,6 @@ class EpochBasedTrainer(BaseTrainer): self._max_epochs = self.cfg.train.max_epochs else: self._max_epochs = kwargs['max_epochs'] - self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) if self._train_iters_per_epoch is None and hasattr( @@ -306,13 +306,15 @@ class EpochBasedTrainer(BaseTrainer): datasets: Union[Dataset, List[Dataset]], mode: str, task_data_config: Config = None, - preprocessor: Optional[Preprocessor] = None): + preprocessor: Optional[Preprocessor] = None, + **kwargs): """Build the task specific dataset processor for this trainer. Returns: The task dataset processor for the task. If no result for the very model-type and task, the default TaskDataset will be returned. 
""" try: + to_tensor = kwargs.get('to_tensor', True) if not datasets: return datasets if isinstance(datasets, TorchTaskDataset): @@ -328,7 +330,8 @@ class EpochBasedTrainer(BaseTrainer): return datasets.to_torch_dataset( task_data_config=task_data_config, task_name=self.cfg.task, - preprocessors=preprocessor) + preprocessors=preprocessor, + to_tensor=to_tensor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): if task_data_config is None: @@ -342,7 +345,8 @@ class EpochBasedTrainer(BaseTrainer): d.to_torch_dataset( task_data_config=task_data_config, task_name=self.cfg.task, - preprocessors=preprocessor) for d in datasets + preprocessors=preprocessor, + to_tensor=to_tensor) for d in datasets ] cfg = ConfigDict( type=self.cfg.model.type, mode=mode, datasets=datasets) @@ -497,6 +501,7 @@ class EpochBasedTrainer(BaseTrainer): dp_cfg = dict( type='DistributedDataParallel', module=model, + find_unused_parameters=True, device_ids=[torch.cuda.current_device()]) return build_parallel(dp_cfg) @@ -779,7 +784,7 @@ class EpochBasedTrainer(BaseTrainer): batch_size = batch_size_per_gpu num_workers = workers_per_gpu - if dist: + if dist and not isinstance(dataset, torch.utils.data.IterableDataset): sampler = DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=shuffle) else: diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 7f5d4ec3..1f8f8ed0 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -69,7 +69,10 @@ def single_gpu_test(model, batch_size = 1 # iteration count else: if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if 'nsentences' in data: + batch_size = data['nsentences'] + else: + batch_size = len(next(iter(data.values()))) else: batch_size = len(data) for _ in range(batch_size): @@ -152,21 +155,29 @@ def multi_gpu_test(model, result = model.forward(data) results.append(result) - if rank == 0: - if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if isinstance(data, dict): + if 'nsentences' in data: + batch_size = data['nsentences'] else: - batch_size = len(data) - - if progress_with_iters: - total_samples += batch_size * world_size - batch_size = 1 # iteration count + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) + if i >= (data_len // world_size) - 1: + total_samples = torch.LongTensor([batch_size]).to(model.device) + dist.all_reduce(total_samples, op=dist.reduce_op.SUM) + total_samples = total_samples.item() + else: + total_samples = batch_size * world_size + if progress_with_iters: + iter_cnt_all = world_size + else: + iter_cnt_all = total_samples + count += iter_cnt_all - batch_size_all = batch_size * world_size - count += batch_size_all + if rank == 0: if count > data_len: - batch_size_all = data_len - (count - batch_size_all) - for _ in range(batch_size_all): + iter_cnt_all = data_len - (count - iter_cnt_all) + for _ in range(iter_cnt_all): pbar.update() if progress_with_iters and (i + 1) >= data_len: diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 6a9d6fd5..86b7bb7d 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -280,6 +280,7 @@ class ConfigKeys(object): """Fixed keywords in configuration file""" train = 'train' val = 'val' + test = 'test' class Requirements(object): diff --git a/modelscope/utils/device.py b/modelscope/utils/device.py index 6fc59e37..83faa261 100644 --- a/modelscope/utils/device.py +++ 
b/modelscope/utils/device.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. - +import os from contextlib import contextmanager from modelscope.utils.constant import Devices, Frameworks @@ -106,3 +106,17 @@ def create_device(device_name): device = torch.device('cpu') return device + + +def get_device(): + import torch + from torch import distributed as dist + if torch.cuda.is_available(): + if dist.is_available() and dist.is_initialized( + ) and 'LOCAL_RANK' in os.environ: + device_id = f"cuda:{os.environ['LOCAL_RANK']}" + else: + device_id = 'cuda:0' + else: + device_id = 'cpu' + return torch.device(device_id) diff --git a/modelscope/utils/multi_modal/fp16/__init__.py b/modelscope/utils/multi_modal/fp16/__init__.py new file mode 100644 index 00000000..81250858 --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .fp16 import FP16_Module, FP16_Optimizer diff --git a/modelscope/utils/multi_modal/fp16/fp16.py b/modelscope/utils/multi_modal/fp16/fp16.py new file mode 100755 index 00000000..37a80e65 --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/fp16.py @@ -0,0 +1,655 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Stable version of apex FP16 Optimizer""" +import torch +from torch import nn +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +from .fp16util import (master_params_to_model_params, + model_grads_to_master_grads) +from .loss_scaler import DynamicLossScaler, LossScaler + +FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) + + +def conversion_helper(val, conversion): + """Apply conversion to val. 
Recursively apply conversion if `val` is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_fp16(val): + """Convert fp32 `val` to fp16""" + + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, FLOAT_TYPES): + val = val.half() + return val + + return conversion_helper(val, half_conversion) + + +def fp16_to_fp32(val): + """Convert fp16 `val` to fp32""" + + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, HALF_TYPES): + val = val.float() + return val + + return conversion_helper(val, float_conversion) + + +class FP16_Module(nn.Module): + + def __init__(self, module): + super(FP16_Module, self).__init__() + self.add_module('module', module.half()) + + def forward(self, *inputs, **kwargs): + return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.module.state_dict(destination, prefix, keep_vars) + + def load_state_dict(self, state_dict, strict=True): + self.module.load_state_dict(state_dict, strict=strict) + + +class FP16_Optimizer(object): + """ + :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, + and manage static or dynamic loss scaling and master weights in a manner transparent to the user. + For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, + and changing the call to ``backward``. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + # Name the FP16_Optimizer instance to replace the existing optimizer + # (recommended but not required): + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + # loss.backward() becomes: + optimizer.backward(loss) + ... + + Example with dynamic loss scaling:: + + ... + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + # optional arg to control dynamic loss scaling behavior + # dynamic_loss_args={'scale_window' : 500}) + # Usually, dynamic_loss_args is not necessary. + + Args: + init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. # noqa + static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. # noqa + dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. # noqa + dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. 
If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. # noqa + verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. # noqa + + ``init_optimizer`` is expected to have been constructed in the ordinary way. + It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be + named to replace ``init_optimizer``, for two reasons: + First, it means that references to the same name + later in the file will not have to change. + Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to + modify ``init_optimizer``. If you do choose a unique name for the new + :class:`FP16_Optimizer` instance, you should only work with this new instance, + because the preexisting optimizer might no longer behave as expected. + + ``init_optimizer`` may be any Pytorch optimizer. + It may contain a mixture of fp16 and fp32 parameters organized into any number of + ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will + ingest these ``param_groups`` and remember them. + + Calls to :: + + loss.backward() + + must be replaced with :: + + optimizer.backward(loss) + + because :class:`FP16_Optimizer` requires ownership of the backward pass to implement + loss scaling and copies to master gradients. + + .. note:: + Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients + are downscaled before being applied. This means that adjusting the loss scale, or using + dynamic loss scaling, should not require retuning the learning rate or any other + hyperparameters. + + + **Advanced options** + + **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. + See docstring for :attr:`step`. + + **Gradient clipping**: Use :attr:`clip_master_grads`. + + **Multiple losses**: If your model accumulates gradients from multiple losses, + this can be made more efficient by supplying ``update_master_grads=False`` + to :attr:`backward`. See docstring for :attr:`backward`. + + **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: + + print(optimizer.loss_scale) + optimizer.loss_scale = new_loss_scale + + For static loss scaling, manually adjusting the loss scale over time is a reasonable + thing to do. During later epochs, gradients may become smaller, and a + higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss + scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting + the loss scale is not recommended. + + **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in + Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` + should still work as intended. + """ + + def __init__(self, + init_optimizer, + static_loss_scale=1.0, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=False): + if not torch.cuda.is_available: + raise SystemError('Cannot use fp16 without CUDA.') + + self.verbose = verbose + + self.optimizer = init_optimizer + # init_state_dict sets up an alternative way to cast per-param state tensors. 
+        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
+        # init_state_dict = init_optimizer.state_dict()
+
+        self.fp16_groups = []
+        self.fp32_from_fp16_groups = []
+        self.fp32_from_fp32_groups = []
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            self.maybe_print(
+                'FP16_Optimizer processing param group {}:'.format(i))
+            fp16_params_this_group = []
+            fp32_params_this_group = []
+            fp32_from_fp16_params_this_group = []
+            for i, param in enumerate(param_group['params']):
+                if param.requires_grad:
+                    if param.type() == 'torch.cuda.HalfTensor':
+                        self.maybe_print(
+                            'FP16_Optimizer received torch.cuda.HalfTensor with {}'
+                            .format(param.size()))
+                        fp16_params_this_group.append(param)
+                        master_param = param.detach().clone().float()
+                        master_param.requires_grad = True
+                        # Copy the model parallel flag.
+                        master_param.model_parallel = param.model_parallel
+                        param_group['params'][i] = master_param
+                        fp32_from_fp16_params_this_group.append(master_param)
+                        # Reset existing state dict key to the new master param.
+                        # We still need to recast per-param state tensors, if any, to FP32.
+                        if param in self.optimizer.state:
+                            self.optimizer.state[
+                                master_param] = self.optimizer.state.pop(param)
+                    elif param.type() == 'torch.cuda.FloatTensor':
+                        self.maybe_print(
+                            'FP16_Optimizer received torch.cuda.FloatTensor with {}'
+                            .format(param.size()))
+                        fp32_params_this_group.append(param)
+                        param_group['params'][i] = param
+                    else:
+                        raise TypeError(
+                            'Wrapped parameters must be either '
+                            'torch.cuda.FloatTensor or torch.cuda.HalfTensor. '
+                            'Received {}'.format(param.type()))
+
+            self.fp16_groups.append(fp16_params_this_group)
+            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+            self.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
+        self.optimizer.load_state_dict(self.optimizer.state_dict())
+        # alternative way to cast per-param state tensors:
+        # self.optimizer.load_state_dict(init_state_dict)
+
+        if dynamic_loss_scale:
+            self.dynamic_loss_scale = True
+            if dynamic_loss_args is not None:
+                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
+            else:
+                self.loss_scaler = DynamicLossScaler()
+        else:
+            self.dynamic_loss_scale = False
+            self.loss_scaler = LossScaler(static_loss_scale)
+
+        self.overflow = False
+        self.first_closure_call_this_step = True
+
+        self.clip_grad_norm = nn.utils.clip_grad.clip_grad_norm_
+
+    def maybe_print(self, msg):
+        if self.verbose:
+            print(msg)
+
+    def __getstate__(self):
+        raise RuntimeError(
+            'FP16_Optimizer should be serialized using state_dict().')
+
+    def __setstate__(self, state):
+        raise RuntimeError(
+            'FP16_Optimizer should be deserialized using load_state_dict().')
+
+    def zero_grad(self, set_grads_to_None=False):
+        """
+        Zero fp32 and fp16 parameter grads.
+        """
+        # In principle, only the .grad attributes of the model params need to be zeroed,
+        # because gradients are copied into the FP32 master params.
However, we zero + # all gradients owned by the optimizer, just to be safe: + for group in self.optimizer.param_groups: + for p in group['params']: + if set_grads_to_None: + p.grad = None + else: + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + # Zero fp16 gradients owned by the model: + for fp16_group in self.fp16_groups: + for param in fp16_group: + if set_grads_to_None: + param.grad = None + else: + if param.grad is not None: + param.grad.detach_( + ) # as in torch.optim.optimizer.zero_grad() + param.grad.zero_() + + def _check_overflow(self): + params = [] + for group in self.fp16_groups: + for param in group: + params.append(param) + for group in self.fp32_from_fp32_groups: + for param in group: + params.append(param) + self.overflow = self.loss_scaler.has_overflow(params) + + def _update_scale(self, has_overflow=False): + self.loss_scaler.update_scale(has_overflow) + + def _master_params_to_model_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp16_group, fp32_from_fp16_group) + + def _model_params_to_master_params(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + master_params_to_model_params(fp32_from_fp16_group, fp16_group) + + # To consider: Integrate distributed with this wrapper by registering a hook on each variable + # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. + def _model_grads_to_master_grads(self): + for fp16_group, fp32_from_fp16_group in zip( + self.fp16_groups, self.fp32_from_fp16_groups): + model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) + + def _downscale_master(self): + if self.loss_scale != 1.0: + for group in self.optimizer.param_groups: + for param in group['params']: + if param.grad is not None: + param.grad.data.mul_(1. / self.loss_scale) + + def clip_master_grads(self, max_norm, norm_type=2): + """ + Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. + + Args: + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + + Returns: + Total norm of the current fp32 gradients (viewed as a single vector). + + .. warning:: + Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). # noqa + """ + if not self.overflow: + fp32_params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + fp32_params.append(param) + return self.clip_grad_norm(fp32_params, max_norm, norm_type) + else: + return -1 + + def state_dict(self): + """ + Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. + This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict + of the contained Pytorch optimizer. 
+ Example:: + + checkpoint = {} + checkpoint['model'] = model.state_dict() + checkpoint['optimizer'] = optimizer.state_dict() + torch.save(checkpoint, "saved.pth") + """ + state_dict = {} + state_dict['loss_scaler'] = self.loss_scaler + state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale + state_dict['overflow'] = self.overflow + state_dict[ + 'first_closure_call_this_step'] = self.first_closure_call_this_step + state_dict['optimizer_state_dict'] = self.optimizer.state_dict() + state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups + return state_dict + + def load_state_dict(self, state_dict): + """ + Loads a state_dict created by an earlier call to state_dict(). + If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, + whose parameters in turn came from ``model``, it is expected that the user + will call ``model.load_state_dict()`` before + ``fp16_optimizer_instance.load_state_dict()`` is called. + + Example:: + + model = torch.nn.Linear(D_in, D_out).cuda().half() + optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) + ... + checkpoint = torch.load("saved.pth") + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + """ + # I think it should actually be ok to reload the optimizer before the model. + self.loss_scaler = state_dict['loss_scaler'] + self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] + self.overflow = state_dict['overflow'] + self.first_closure_call_this_step = state_dict[ + 'first_closure_call_this_step'] + self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) + # At this point, the optimizer's references to the model's fp32 parameters are up to date. + # The optimizer's hyperparameters and internal buffers are also up to date. + # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still + # out of date. There are two options. + # 1: Refresh the master params from the model's fp16 params. + # This requires less storage but incurs precision loss. + # 2: Save and restore the fp32 master copies separately. + # We choose option 2. + # + # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device + # of their associated parameters, because it's possible those buffers might not exist yet in + # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been + # constructed in the same way as the one whose state_dict we are loading, the same master params + # are guaranteed to exist, so we can just copy_() from the saved master params. + for current_group, saved_group in zip(self.fp32_from_fp16_groups, + state_dict['fp32_from_fp16']): + for current, saved in zip(current_group, saved_group): + current.data.copy_(saved.data) + + def step(self, closure=None): # could add clip option. + """ + If no closure is supplied, :attr:`step` should be called after + ``fp16_optimizer_obj.backward(loss)``. + :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to + :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params + originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run + another forward pass using their model. + + If a closure is supplied, :attr:`step` may be called without a prior call to + :attr:`backward(loss)`. + This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
+ However, the user should take care that any ``loss.backward()`` call within the closure + has been replaced by ``fp16_optimizer_obj.backward(loss)``. + + Args: + closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. # noqa + + Example with closure:: + + # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an + # existing pytorch optimizer. + for input, target in dataset: + def closure(): + optimizer.zero_grad() + output = model(input) + loss = loss_fn(output, target) + # loss.backward() becomes: + optimizer.backward(loss) + return loss + optimizer.step(closure) + + .. warning:: + Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. + + .. _`ordinary Pytorch optimizer use`: + http://pytorch.org/docs/master/optim.html#optimizer-step-closure + """ + + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + + if self.overflow: + self.maybe_print( + 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' + .format(scale, self.loss_scale)) + return + + if closure is not None: + retval = self._step_with_closure(closure) + else: + retval = self.optimizer.step() + + self._master_params_to_model_params() + + return retval + + def _step_with_closure(self, closure): + + def wrapped_closure(): + # helpful for debugging + # print("Calling wrapped_closure, first_closure_call_this_step = {}" + # .format(self.first_closure_call_this_step)) + if self.first_closure_call_this_step: + # We expect that the fp16 params are initially fresh on entering self.step(), + # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() + # is called within self.optimizer.step(). + self.first_closure_call_this_step = False + else: + # If self.optimizer.step() internally calls wrapped_closure more than once, + # it may update the fp32 params after each call. However, self.optimizer + # doesn't know about the fp16 params at all. If the fp32 params get updated, + # we can't rely on self.optimizer to refresh the fp16 params. We need + # to handle that manually: + self._master_params_to_model_params() + # Our API expects the user to give us ownership of the backward() call by + # replacing all calls to loss.backward() with optimizer.backward(loss). + # This requirement holds whether or not the call to backward() is made within a closure. + # If the user is properly calling optimizer.backward(loss) within "closure," + # calling closure() here will give the fp32 master params fresh gradients + # for the optimizer to play with, so all wrapped_closure needs to do is call + # closure() and return the loss. + temp_loss = closure() + while (self.overflow): + scale = self.loss_scaler.loss_scale + self._update_scale(self.overflow) + self.maybe_print( + 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' + 'reducing to {}'.format(scale, self.loss_scale)) + temp_loss = closure() + return temp_loss + + retval = self.optimizer.step(wrapped_closure) + + self.first_closure_call_this_step = True + + return retval + + def backward(self, loss, update_master_grads=True, retain_graph=False): + """ + :attr:`backward` performs the following conceptual steps: + + 1. fp32_loss = loss.float() (see first Note below) + 2. scaled_loss = fp32_loss*loss_scale + 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). # noqa + 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. # noqa + 5. Finally, master grads are divided by loss_scale. + + In this way, after :attr:`backward`, the master params have fresh gradients, + and :attr:`step` may be called. + + .. note:: + :attr:`backward` internally converts the loss to fp32 before applying the loss scale. + This provides some additional safety against overflow if the user has supplied an + fp16 loss value. + However, for maximum overflow safety, the user should + compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to + :attr:`backward`. + + .. warning:: + The gradients found in a model's leaves after the call to + :attr:`backward` should not be regarded as valid in general, + because it's possible + they have been scaled (and in the case of dynamic loss scaling, + the scale factor may change over time). + If the user wants to inspect gradients after a call to :attr:`backward`, + only the master gradients should be regarded as valid. These can be retrieved via + :attr:`inspect_master_grad_data()`. + + Args: + loss: The loss output by the user's model. loss may be either float or half (but see first Note above). + update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. # noqa + retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). # noqa + + Example:: + + # Ordinary operation: + optimizer.backward(loss) + + # Naive operation with multiple losses (technically valid, but less efficient): + # fp32 grads will be correct after the second call, but + # the first call incurs an unnecessary fp16->fp32 grad copy. + optimizer.backward(loss1) + optimizer.backward(loss2) + + # More efficient way to handle multiple losses: + # The fp16->fp32 grad copy is delayed until fp16 grads from all + # losses have been accumulated. + optimizer.backward(loss1, update_master_grads=False) + optimizer.backward(loss2, update_master_grads=False) + optimizer.update_master_grads() + """ + # To consider: try multiple backward passes using retain_grad=True to find + # a loss scale that works. After you find a loss scale that works, do a final dummy + # backward pass with retain_graph=False to tear down the graph. Doing this would avoid + # discarding the iteration, but probably wouldn't improve overall efficiency. + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + if update_master_grads: + self.update_master_grads() + + def update_master_grads(self): + """ + Copy the ``.grad`` attribute from stored references to fp16 parameters to + the ``.grad`` attribute of the fp32 master parameters that are directly + updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if + ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. + """ + if self.dynamic_loss_scale: + self._check_overflow() + if self.overflow: return # noqa + self._model_grads_to_master_grads() + self._downscale_master() + + def inspect_master_grad_data(self): + """ + When running with :class:`FP16_Optimizer`, + ``.grad`` attributes of a model's fp16 leaves should not be + regarded as truthful, because they might be scaled. + After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, + the fp32 master params' ``.grad`` + attributes will contain valid gradients properly divided by the loss scale. However, + because :class:`FP16_Optimizer` flattens some parameters, accessing them may be + nonintuitive. :attr:`inspect_master_grad_data` + allows those gradients to be viewed with shapes corresponding to their associated model leaves. + + Returns: + List of lists (one list for each parameter group). The list for each parameter group + is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. + """ + if self.overflow: + print( + 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' + 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' + ) + return None + else: + # The optimizer owns only references to master params. + master_grads_data = [] + for param_group in self.optimizer.param_groups: + master_grads_this_group = [] + for param in param_group['params']: + if param.grad is not None: + master_grads_this_group.append(param.grad.data) + else: + master_grads_this_group.append(None) + master_grads_data.append(master_grads_this_group) + return master_grads_data + + # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" + def _get_loss_scale(self): + return self.loss_scaler.loss_scale + + def _set_loss_scale(self, value): + self.loss_scaler.cur_scale = value + + loss_scale = property(_get_loss_scale, _set_loss_scale) + + # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/utils/multi_modal/fp16/fp16util.py b/modelscope/utils/multi_modal/fp16/fp16util.py new file mode 100644 index 00000000..29595a6c --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/fp16util.py @@ -0,0 +1,216 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
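For orientation, a minimal usage sketch of the FP16_Optimizer defined above. Everything in it is illustrative: the Linear model, tensor sizes and loss scale are made up, a CUDA device is assumed (the constructor refuses to run without one), and the model_parallel flag is set by hand only because the constructor copies that Megatron-style attribute from every fp16 parameter it wraps, which plain nn.Parameters do not carry.

import torch
import torch.nn.functional as F

from modelscope.utils.multi_modal.fp16 import FP16_Module, FP16_Optimizer

# Toy model and data; sizes and hyperparameters are arbitrary.
model = FP16_Module(torch.nn.Linear(16, 4).cuda())
for p in model.parameters():
    # The wrapper reads a Megatron-style `model_parallel` flag when it builds
    # the fp32 master copies, so give ordinary parameters one explicitly.
    p.model_parallel = False

optimizer = FP16_Optimizer(
    torch.optim.SGD(model.parameters(), lr=1e-3), static_loss_scale=128.0)

x = torch.randn(8, 16).cuda()
target = torch.randn(8, 4).cuda()

for _ in range(3):
    optimizer.zero_grad()
    loss = F.mse_loss(model(x), target)
    optimizer.backward(loss)          # replaces loss.backward()
    optimizer.clip_master_grads(1.0)  # optional, clips the fp32 master grads
    optimizer.step()                  # skipped automatically on overflow

The only behavioural change relative to a plain optimizer is that loss.backward() becomes optimizer.backward(loss), which is what lets the wrapper apply the loss scale and copy gradients into the fp32 master weights before the update.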
+ +import torch +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from torch.autograd import Variable + + +class tofp16(nn.Module): + """ + Utility module that implements:: + + def forward(self, input): + return input.half() + """ + + def __init__(self): + super(tofp16, self).__init__() + + def forward(self, input): + return input.half() + + +def BN_convert_float(module): + """ + Utility function for network_to_half(). + + Retained for legacy purposes. + """ + if isinstance( + module, + torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: + module.float() + for child in module.children(): + BN_convert_float(child) + return module + + +def network_to_half(network): + """ + Convert model to half precision in a batchnorm-safe way. + + Retained for legacy purposes. It is recommended to use FP16Model. + """ + return nn.Sequential(tofp16(), BN_convert_float(network.half())) + + +def convert_module(module, dtype): + """ + Converts a module's immediate parameters and buffers to dtype. + """ + for param in module.parameters(recurse=False): + if param is not None: + if param.data.dtype.is_floating_point: + param.data = param.data.to(dtype=dtype) + if param._grad is not None and param._grad.data.dtype.is_floating_point: + param._grad.data = param._grad.data.to(dtype=dtype) + + for buf in module.buffers(recurse=False): + if buf is not None and buf.data.dtype.is_floating_point: + buf.data = buf.data.to(dtype=dtype) + + +def convert_network(network, dtype): + """ + Converts a network's parameters and buffers to dtype. + """ + for module in network.modules(): + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm + ) and module.affine is True: + continue + convert_module(module, dtype) + return network + + +class FP16Model(nn.Module): + """ + Convert model to half precision in a batchnorm-safe way. + """ + + def __init__(self, network): + super(FP16Model, self).__init__() + self.network = convert_network(network, dtype=torch.half) + + def forward(self, *inputs): + inputs = tuple(t.half() for t in inputs) + return self.network(*inputs) + + +def backwards_debug_hook(grad): + raise RuntimeError( + 'master_params recieved a gradient in the backward pass!') + + +def prep_param_lists(model, flat_master=False): + """ + Creates a list of FP32 master parameters for a given model, as in + `Training Neural Networks with Mixed Precision: Real Examples`_. + + Args: + model (torch.nn.Module): Existing Pytorch model + flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. # noqa + Returns: + A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. # noqa + + Example:: + + model_params, master_params = prep_param_lists(model) + + .. warning:: + Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. # noqa + + .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: + http://on-demand.gputechconf.com/gtc/2018/video/S81012/ + """ + model_params = [ + param for param in model.parameters() if param.requires_grad + ] + + if flat_master: + # Give the user some more useful error messages + try: + # flatten_dense_tensors returns a contiguous flat array. + # http://pytorch.org/docs/master/_modules/torch/_utils.html + master_params = _flatten_dense_tensors( + [param.data for param in model_params]).float() + except: # noqa + print( + 'Error in prep_param_lists: model may contain a mixture of parameters ' + 'of different types. Use flat_master=False, or use F16_Optimizer.' + ) + raise + master_params = torch.nn.Parameter(master_params) + master_params.requires_grad = True + # master_params.register_hook(backwards_debug_hook) + if master_params.grad is None: + master_params.grad = master_params.new(*master_params.size()) + return model_params, [master_params] + else: + master_params = [ + param.clone().float().detach() for param in model_params + ] + for param in master_params: + param.requires_grad = True + return model_params, master_params + + +def model_grads_to_master_grads(model_params, + master_params, + flat_master=False): + """ + Copy model gradients to master gradients. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. # noqa + """ + if flat_master: + # The flattening may incur one more deep copy than is necessary. + master_params[0].grad.data.copy_( + _flatten_dense_tensors([p.grad.data for p in model_params])) + else: + for model, master in zip(model_params, master_params): + if model.grad is not None: + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size())) + master.grad.data.copy_(model.grad.data) + else: + master.grad = None + + +def master_params_to_model_params(model_params, + master_params, + flat_master=False): + """ + Copy master parameters to model parameters. + + Args: + model_params: List of model parameters created by :func:`prep_param_lists`. + master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. # noqa + """ + if flat_master: + for model, master in zip( + model_params, + _unflatten_dense_tensors(master_params[0].data, model_params)): + model.data.copy_(master) + else: + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + + +# Backward compatibility fixes + + +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) diff --git a/modelscope/utils/multi_modal/fp16/loss_scaler.py b/modelscope/utils/multi_modal/fp16/loss_scaler.py new file mode 100755 index 00000000..fc55a4ed --- /dev/null +++ b/modelscope/utils/multi_modal/fp16/loss_scaler.py @@ -0,0 +1,237 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + + +# item() is a recent addition, so this helps with backward compatibility. +def to_python_float(t): + if hasattr(t, 'item'): + return t.item() + else: + return t[0] + + +class LossScaler: + """ + Class that manages a static loss scale. This class is intended to interact with + :class:`FP16_Optimizer`, and should not be directly manipulated by the user. + + Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to + :class:`FP16_Optimizer`'s constructor. + + Args: + scale (float, optional, default=1.0): The loss scale. + """ + + def __init__(self, scale=1): + self.cur_scale = scale + + # `params` is a list / generator of torch.Variable + def has_overflow(self, params): + return False + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + return False + + def update_scale(self, overflow): + pass + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +class DynamicLossScaler: + """ + Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` + indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of + :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` + operates, because the default options can be changed using the + the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. + + Loss scaling is designed to combat the problem of underflowing gradients encountered at long + times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are + encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has + occurred. + :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, + and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients detected, + :class:`DynamicLossScaler` increases the loss scale once more. + In this way :class:`DynamicLossScaler` attempts to "ride the edge" of + always using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` + scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. # noqa + scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
# noqa + """ + + def __init__(self, + init_scale=2**32, + scale_factor=2., + scale_window=1000, + min_scale=1, + delayed_shift=1, + consecutive_hysteresis=False): + self.cur_scale = init_scale + self.cur_iter = 0 + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + self.min_scale = min_scale + self.delayed_shift = delayed_shift + self.cur_hysteresis = delayed_shift + self.consecutive_hysteresis = consecutive_hysteresis + + # `params` is a list / generator of torch.Variable + def has_overflow_serial(self, params): + for p in params: + if p.grad is not None and DynamicLossScaler._has_inf_or_nan( + p.grad.data): + return True + + return False + + def has_overflow(self, params): + overflow = self.has_overflow_serial(params) + overflow_gpu = torch.cuda.ByteTensor([overflow]) + overflow = overflow_gpu[0].item() + return bool(overflow) + + # `x` is a torch.Tensor + def _has_inf_or_nan(x): + try: + # if x is half, the .float() incurs an additional deep copy, but it's necessary if + # Pytorch's .sum() creates a one-element tensor of the same type as x + # (which is true for some recent version of pytorch). + cpu_sum = float(x.float().sum()) + # More efficient version that can be used if .sum() returns a Python scalar + # cpu_sum = float(x.sum()) + except RuntimeError as instance: + # We want to check if inst is actually an overflow exception. + # RuntimeError could come from a different error. + # If so, we still want the exception to propagate. + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float( + 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: + return True + return False + + # `overflow` is boolean indicating whether the gradient overflowed + def update_scale(self, overflow): + + if not hasattr(self, 'min_scale'): + self.min_scale = 1 + if not hasattr(self, 'delayed_shift'): + self.delayed_shift = 1 + if not hasattr(self, 'cur_hysteresis'): + self.cur_hysteresis = 1 + if not hasattr(self, 'consecutive_hysteresis'): + self.consecutive_hysteresis = True + if overflow: + # self.cur_scale /= self.scale_factor + if self.delayed_shift == 1 or self.cur_hysteresis == 1: + self.cur_scale = max(self.cur_scale / self.scale_factor, + self.min_scale) + else: + self.cur_hysteresis -= 1 + self.last_overflow_iter = self.cur_iter + else: + if self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + if (self.cur_iter + - self.last_overflow_iter) % self.scale_window == 0: + if not self.consecutive_hysteresis: + self.cur_hysteresis = self.delayed_shift + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + @property + def loss_scale(self): + return self.cur_scale + + def scale_gradient(self, module, grad_in, grad_out): + return tuple(self.loss_scale * g for g in grad_in) + + def backward(self, loss, retain_graph=False): + scaled_loss = loss * self.loss_scale + scaled_loss.backward(retain_graph=retain_graph) + + +############################################################## +# Example usage below here -- assuming it's in a separate file +############################################################## +""" +TO-DO separate out into an example. +if __name__ == "__main__": + import torch + from torch.autograd import Variable + from dynamic_loss_scaler import DynamicLossScaler + + # N is batch size; D_in is input dimension; + # H is hidden dimension; D_out is output dimension. 
+ N, D_in, H, D_out = 64, 1000, 100, 10 + + # Create random Tensors to hold inputs and outputs, and wrap them in Variables. + x = Variable(torch.randn(N, D_in), requires_grad=False) + y = Variable(torch.randn(N, D_out), requires_grad=False) + + w1 = Variable(torch.randn(D_in, H), requires_grad=True) + w2 = Variable(torch.randn(H, D_out), requires_grad=True) + parameters = [w1, w2] + + learning_rate = 1e-6 + optimizer = torch.optim.SGD(parameters, lr=learning_rate) + loss_scaler = DynamicLossScaler() + + for t in range(500): + y_pred = x.mm(w1).clamp(min=0).mm(w2) + loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale + print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) + print('Iter {} scaled loss: {}'.format(t, loss.data[0])) + print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) + + # Run backprop + optimizer.zero_grad() + loss.backward() + + # Check for overflow + has_overflow = DynamicLossScaler.has_overflow(parameters) + + # If no overflow, unscale grad and update as usual + if not has_overflow: + for param in parameters: + param.grad.data.mul_(1. / loss_scaler.loss_scale) + optimizer.step() + # Otherwise, don't do anything -- ie, skip iteration + else: + print('OVERFLOW!') + + # Update loss scale for next iteration + loss_scaler.update_scale(has_overflow) + +""" diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 02e87baa..255f6155 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -5,6 +5,7 @@ pycocotools>=2.0.4 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 +sacrebleu taming-transformers-rom1504 timm tokenizers diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py new file mode 100644 index 00000000..06003625 --- /dev/null +++ b/tests/trainers/test_ofa_trainer.py @@ -0,0 +1,105 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
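+# Fine-tuning smoke test for the OFA image-captioning trainer: setUp() builds a
+# minimal fine-tune configuration dict, and test_trainer_std() writes it into the
+# work dir as the model configuration file, loads small train/validation slices of
+# the coco_2014_caption dataset, runs a single epoch through the OFA trainer and
+# asserts that the trained binary appears in the output directory.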
+import os +import shutil +import unittest + +import json + +from modelscope.metainfo import Metrics, Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import test_level + + +class TestOfaTrainer(unittest.TestCase): + + def setUp(self) -> None: + self.finetune_cfg = \ + {'framework': 'pytorch', + 'task': 'image-captioning', + 'model': {'type': 'ofa', + 'beam_search': {'beam_size': 5, + 'max_len_b': 16, + 'min_len': 1, + 'no_repeat_ngram_size': 0}, + 'seed': 7, + 'max_src_length': 256, + 'language': 'en', + 'gen_type': 'generation', + 'patch_image_size': 480, + 'max_image_size': 480, + 'imagenet_default_mean_and_std': False}, + 'pipeline': {'type': 'image-captioning'}, + 'dataset': {'column_map': {'text': 'caption'}}, + 'train': {'work_dir': 'work/ckpts/caption', + # 'launcher': 'pytorch', + 'max_epochs': 1, + 'use_fp16': True, + 'dataloader': {'batch_size_per_gpu': 1, 'workers_per_gpu': 0}, + 'lr_scheduler': {'name': 'polynomial_decay', + 'warmup_proportion': 0.01, + 'lr_end': 1e-07}, + 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, + 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + 'cumulative_iters': 1, + 'grad_clip': {'max_norm': 1.0, 'norm_type': 2}, + 'loss_keys': 'loss'}, + 'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion', + 'constraint_range': None, + 'drop_worst_after': 0, + 'drop_worst_ratio': 0.0, + 'ignore_eos': False, + 'ignore_prefix_size': 0, + 'label_smoothing': 0.1, + 'reg_alpha': 1.0, + 'report_accuracy': False, + 'sample_patch_num': 196, + 'sentence_avg': False, + 'use_rdrop': False}, + 'hooks': [{'type': 'BestCkptSaverHook', + 'metric_key': 'bleu-4', + 'interval': 100}, + {'type': 'TextLoggerHook', 'interval': 1}, + {'type': 'IterTimerHook'}, + {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, + 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'metrics': [{'type': 'bleu', + 'eval_tokenized_bleu': False, + 'ref_name': 'labels', + 'hyp_name': 'caption'}]}, + 'preprocessor': []} + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_std(self): + WORKSPACE = './workspace/ckpts/caption' + os.makedirs(WORKSPACE, exist_ok=True) + config_file = os.path.join(WORKSPACE, ModelFile.CONFIGURATION) + with open(config_file, 'w') as writer: + json.dump(self.finetune_cfg, writer) + + pretrained_model = 'damo/ofa_image-caption_coco_distilled_en' + args = dict( + model=pretrained_model, + work_dir=WORKSPACE, + train_dataset=MsDataset.load( + 'coco_2014_caption', + namespace='modelscope', + split='train[:20]'), + eval_dataset=MsDataset.load( + 'coco_2014_caption', + namespace='modelscope', + split='validation[:10]'), + metrics=[Metrics.BLEU], + cfg_file=config_file) + trainer = build_trainer(name=Trainers.ofa, default_args=args) + trainer.train() + + self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE, + os.listdir(os.path.join(WORKSPACE, 'output'))) + shutil.rmtree(WORKSPACE) + + +if __name__ == '__main__': + unittest.main() From ffd834fc258c02450064f5c8c4df6f0389226a4c Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 25 Oct 2022 12:58:02 +0800 Subject: [PATCH 115/129] [to #42322933] Add bloom model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加 bloom 模型 Link: 
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10509187 --- modelscope/metainfo.py | 1 + modelscope/models/nlp/bloom/backbone.py | 15 +++++++++++++++ .../models/nlp/task_models/text_generation.py | 8 ++++---- .../pipelines/nlp/text_generation_pipeline.py | 2 +- tests/pipelines/test_text_generation.py | 6 ++++++ 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 modelscope/models/nlp/bloom/backbone.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index f77ff299..16190eb8 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -80,6 +80,7 @@ class Models(object): bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' T5 = 'T5' + bloom = 'bloom' # audio models sambert_hifigan = 'sambert-hifigan' diff --git a/modelscope/models/nlp/bloom/backbone.py b/modelscope/models/nlp/bloom/backbone.py new file mode 100644 index 00000000..b6bd315e --- /dev/null +++ b/modelscope/models/nlp/bloom/backbone.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from transformers import BloomConfig +from transformers import BloomModel as BloomModelTransform + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields + + +@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.bloom) +class BloomModel(BloomModelTransform): + + def __init__(self, **kwargs): + config = BloomConfig(**kwargs) + super().__init__(config) diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py index 973198ae..f17b0f6b 100644 --- a/modelscope/models/nlp/task_models/text_generation.py +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -51,12 +51,9 @@ class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): return addict.Dict(outputs) def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): - token_type_ids = kwargs.get('token_type_ids', None) # only last token for inputs_ids if past is defined in kwargs if past: input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) attention_mask = kwargs.get('attention_mask', None) position_ids = kwargs.get('position_ids', None) @@ -75,5 +72,8 @@ class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): 'use_cache': kwargs.get('use_cache'), 'position_ids': position_ids, 'attention_mask': attention_mask, - 'token_type_ids': token_type_ids, } + + def generate(self, inputs, *args, **kwargs): + input_ids = inputs['input_ids'] if isinstance(inputs, Dict) else inputs + return super().generate(input_ids, *args, **kwargs) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index ae92f26a..28acebb4 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -79,7 +79,7 @@ class TextGenerationPipeline(Pipeline): return self.model.generate(inputs, **forward_params) def sentence_piece(self, inputs) -> Dict[str, Tensor]: - return self.preprocessor.tokenizer.decode(inputs.tolist())[0] + return self.preprocessor.tokenizer.decode(inputs.tolist()[0]) def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]: diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 4b0ebd47..f624f021 100644 --- a/tests/pipelines/test_text_generation.py 
+++ b/tests/pipelines/test_text_generation.py @@ -182,6 +182,12 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): max_length=20, repetition_penalty=0.5)) + @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") + def test_bloom(self): + pipe = pipeline( + task=Tasks.text_generation, model='Langboat/bloom-1b4-zh') + print(pipe('中国的首都是')) + if __name__ == '__main__': unittest.main() From e1ab73b7d848a558f9498ac4f5d2277eb2752e5a Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 25 Oct 2022 13:55:09 +0800 Subject: [PATCH 116/129] [to #42322933]support type str for for zero-shot labels' input Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10506320 --- .../nlp/zero_shot_classification_pipeline.py | 11 ++++++++++- tests/pipelines/test_zero_shot_classification.py | 5 +++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py index 88792b45..ecd538b9 100644 --- a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -74,7 +74,8 @@ class ZeroShotClassificationPipeline(Pipeline): preprocess_params = {} postprocess_params = {} if 'candidate_labels' in kwargs: - candidate_labels = kwargs.pop('candidate_labels') + candidate_labels = self._parse_labels( + kwargs.pop('candidate_labels')) preprocess_params['candidate_labels'] = candidate_labels postprocess_params['candidate_labels'] = candidate_labels else: @@ -84,6 +85,14 @@ class ZeroShotClassificationPipeline(Pipeline): postprocess_params['multi_label'] = kwargs.pop('multi_label', False) return preprocess_params, {}, postprocess_params + def _parse_labels(self, labels): + if isinstance(labels, str): + labels = labels.replace(',', ',') # replace cn comma to en comma + labels = [ + label.strip() for label in labels.split(',') if label.strip() + ] + return labels + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: return self.model(**inputs, **forward_params) diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py index da1854c9..6a98132a 100644 --- a/tests/pipelines/test_zero_shot_classification.py +++ b/tests/pipelines/test_zero_shot_classification.py @@ -21,6 +21,7 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck): sentence = '全新突破 解放军运20版空中加油机曝光' labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'] + labels_str = '文化, 体育, 娱乐, 财经, 家居, 汽车, 教育, 科技, 军事' template = '这篇文章的标题是{}' regress_tool = MsRegressTool(baseline=False) @@ -40,6 +41,10 @@ class ZeroShotClassificationTest(unittest.TestCase, DemoCompatibilityCheck): f'sentence: {self.sentence}\n' f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}' ) + print( + f'sentence: {self.sentence}\n' + f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels_str,hypothesis_template=self.template)}' + ) print( f'sentence: {self.sentence}\n' f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}' From 62339161cddb9bfa0f979f6354a37acfa11cbd2b Mon Sep 17 00:00:00 2001 From: "yichang.zyc" Date: Tue, 25 Oct 2022 19:26:44 +0800 Subject: [PATCH 117/129] revert args of metric init Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10521235 --- modelscope/metrics/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 1b9db825..955946b5 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -10,6 +10,9 @@ class Metric(ABC): complex metrics for a specific task with or without other Metric subclasses. """ + def __init__(self, *args, **kwargs): + pass + @abstractmethod def add(self, outputs: Dict, inputs: Dict): """ Append logits and labels within an eval loop. From 41b35619e83cc46d9592ecffbac1ff0d2ce2966a Mon Sep 17 00:00:00 2001 From: "lllcho.lc" Date: Tue, 25 Oct 2022 20:31:53 +0800 Subject: [PATCH 118/129] [to #42322933] Fix bug for demo service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在demo service场景,同时调用同一个视频文件,会导致ffmpeg处理同名视频的冲突。通过uuid生成唯一的文件名解决这个冲突。 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10518178 --- .../models/cv/action_detection/action_detection_onnx.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelscope/models/cv/action_detection/action_detection_onnx.py b/modelscope/models/cv/action_detection/action_detection_onnx.py index 1c8be354..223d77f7 100644 --- a/modelscope/models/cv/action_detection/action_detection_onnx.py +++ b/modelscope/models/cv/action_detection/action_detection_onnx.py @@ -4,6 +4,7 @@ import os import os.path as osp import shutil import subprocess +import uuid import cv2 import numpy as np @@ -84,7 +85,9 @@ class ActionDetONNX(Model): def forward_video(self, video_name, scale): min_size, max_size = self._get_sizes(scale) - tmp_dir = osp.join(self.tmp_dir, osp.basename(video_name)[:-4]) + tmp_dir = osp.join( + self.tmp_dir, + str(uuid.uuid1()) + '_' + osp.basename(video_name)[:-4]) if osp.exists(tmp_dir): shutil.rmtree(tmp_dir) os.makedirs(tmp_dir) @@ -110,6 +113,7 @@ class ActionDetONNX(Model): len(frame_names) * self.temporal_stride, self.temporal_stride)) batch_imgs = [self.parse_frames(names) for names in frame_names] + shutil.rmtree(tmp_dir) N, _, T, H, W = batch_imgs[0].shape scale_min = min_size / min(H, W) @@ -128,7 +132,6 @@ class ActionDetONNX(Model): 'timestamp': t, 'actions': res } for t, res in zip(timestamp, results)] - shutil.rmtree(tmp_dir) return results def forward(self, video_name): From c2da44b371d88f81e39fac4a0bbdfbd1573c6e21 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 25 Oct 2022 22:38:49 +0800 Subject: [PATCH 119/129] [to #42322933] remove dev model inference and fix some bugs 1. Change structbert dev revision to master revision 2. Fix bug: Sample code failed because the updating of model configuration 3. 
Fix bug: Continue training regression failed Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10519992 --- modelscope/models/builder.py | 10 +++++++-- modelscope/models/nlp/__init__.py | 2 ++ modelscope/models/nlp/structbert/adv_utils.py | 2 +- modelscope/trainers/hooks/checkpoint_hook.py | 15 +++++++------ modelscope/trainers/trainer.py | 21 ++++++++++++------- tests/pipelines/test_fill_mask.py | 7 ++----- .../test_sentiment_classification.py | 13 +++++------- tests/trainers/test_trainer_with_nlp.py | 15 ++++++------- 8 files changed, 45 insertions(+), 40 deletions(-) diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index a35358c1..2804c6c7 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -2,13 +2,19 @@ from modelscope.utils.config import ConfigDict from modelscope.utils.constant import Tasks +from modelscope.utils.import_utils import INDEX_KEY, LazyImportModule from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg MODELS = Registry('models') -BACKBONES = Registry('backbones') -BACKBONES._modules = MODELS._modules +BACKBONES = MODELS HEADS = Registry('heads') +modules = LazyImportModule.AST_INDEX[INDEX_KEY] +for module_index in list(modules.keys()): + if module_index[1] == Tasks.backbone and module_index[0] == 'BACKBONES': + modules[(MODELS.name.upper(), module_index[1], + module_index[2])] = modules[module_index] + def build_model(cfg: ConfigDict, task_name: str = None, diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index dff42d1c..5ae93caa 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: SbertForSequenceClassification, SbertForTokenClassification, SbertTokenizer, + SbertModel, SbertTokenizerFast, ) from .bert import ( @@ -61,6 +62,7 @@ else: 'SbertForTokenClassification', 'SbertTokenizer', 'SbertTokenizerFast', + 'SbertModel', ], 'veco': [ 'VecoModel', 'VecoConfig', 'VecoForTokenClassification', diff --git a/modelscope/models/nlp/structbert/adv_utils.py b/modelscope/models/nlp/structbert/adv_utils.py index 44aae85c..91a4cb82 100644 --- a/modelscope/models/nlp/structbert/adv_utils.py +++ b/modelscope/models/nlp/structbert/adv_utils.py @@ -98,7 +98,7 @@ def compute_adv_loss(embedding, if is_nan: logger.warning('Nan occured when calculating adv loss.') return ori_loss - emb_grad = emb_grad / emb_grad_norm + emb_grad = emb_grad / (emb_grad_norm + 1e-6) embedding_2 = embedding_1 + adv_grad_factor * emb_grad embedding_2 = torch.max(embedding_1 - adv_bound, embedding_2) embedding_2 = torch.min(embedding_1 + adv_bound, embedding_2) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index c9f51a88..9b86d5b5 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -69,7 +69,7 @@ class CheckpointHook(Hook): self.rng_state = meta.get('rng_state') self.need_load_rng_state = True - def before_train_epoch(self, trainer): + def before_train_iter(self, trainer): if self.need_load_rng_state: if self.rng_state is not None: random.setstate(self.rng_state['random']) @@ -84,13 +84,6 @@ class CheckpointHook(Hook): 'this may cause a random data order or model initialization.' 
) - self.rng_state = { - 'random': random.getstate(), - 'numpy': np.random.get_state(), - 'cpu': torch.random.get_rng_state(), - 'cuda': torch.cuda.get_rng_state_all(), - } - def after_train_epoch(self, trainer): if not self.by_epoch: return @@ -142,6 +135,12 @@ class CheckpointHook(Hook): cur_save_name = os.path.join( self.save_dir, f'{LogKeys.ITER}_{trainer.iter + 1}.pth') + self.rng_state = { + 'random': random.getstate(), + 'numpy': np.random.get_state(), + 'cpu': torch.random.get_rng_state(), + 'cuda': torch.cuda.get_rng_state_all(), + } meta = { 'epoch': trainer.epoch, 'iter': trainer.iter + 1, diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index f47bff10..605136e5 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -354,6 +354,9 @@ class EpochBasedTrainer(BaseTrainer): task_dataset.trainer = self return task_dataset else: + if task_data_config is None: + # adapt to some special models + task_data_config = {} # avoid add no str value datasets, preprocessors in cfg task_data_build_config = ConfigDict( type=self.cfg.model.type, @@ -419,13 +422,17 @@ class EpochBasedTrainer(BaseTrainer): return metrics def set_checkpoint_file_to_hook(self, checkpoint_path): - if checkpoint_path is not None and os.path.isfile(checkpoint_path): - from modelscope.trainers.hooks import CheckpointHook - checkpoint_hooks = list( - filter(lambda hook: isinstance(hook, CheckpointHook), - self.hooks)) - for hook in checkpoint_hooks: - hook.checkpoint_file = checkpoint_path + if checkpoint_path is not None: + if os.path.isfile(checkpoint_path): + from modelscope.trainers.hooks import CheckpointHook + checkpoint_hooks = list( + filter(lambda hook: isinstance(hook, CheckpointHook), + self.hooks)) + for hook in checkpoint_hooks: + hook.checkpoint_file = checkpoint_path + else: + self.logger.error( + f'No {checkpoint_path} found in local file system.') def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 568865c6..35202b88 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -83,7 +83,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # bert language = 'zh' - model_dir = snapshot_download(self.model_id_bert, revision='beta') + model_dir = snapshot_download(self.model_id_bert) preprocessor = NLPPreprocessor( model_dir, first_sequence='sentence', second_sequence=None) model = Model.from_pretrained(model_dir) @@ -149,10 +149,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): # Bert language = 'zh' - pipeline_ins = pipeline( - task=Tasks.fill_mask, - model=self.model_id_bert, - model_revision='beta') + pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert) print( f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py index b3d9b9d6..5c8d4e93 100644 --- a/tests/pipelines/test_sentiment_classification.py +++ b/tests/pipelines/test_sentiment_classification.py @@ -24,10 +24,10 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_file_download(self): - cache_path = snapshot_download(self.model_id, revision='beta') + cache_path = 
snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) model = SequenceClassificationModel.from_pretrained( - self.model_id, num_labels=2, revision='beta') + self.model_id, num_labels=2) pipeline1 = TextClassificationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.text_classification, model=model, preprocessor=tokenizer) @@ -38,7 +38,7 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - model = Model.from_pretrained(self.model_id, revision='beta') + model = Model.from_pretrained(self.model_id) tokenizer = SequenceClassificationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.text_classification, @@ -51,17 +51,14 @@ class SentimentClassificationTaskModelTest(unittest.TestCase, @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( - task=Tasks.text_classification, - model=self.model_id, - model_revision='beta') + task=Tasks.text_classification, model=self.model_id) print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_default_model(self): - pipeline_ins = pipeline( - task=Tasks.text_classification, model_revision='beta') + pipeline_ins = pipeline(task=Tasks.text_classification) print(pipeline_ins(input=self.sentence1)) self.assertTrue( isinstance(pipeline_ins.model, SequenceClassificationModel)) diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 9380ad0f..8aaa42a3 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -37,13 +37,12 @@ class TestTrainerWithNlp(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_trainer(self): - model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + model_id = 'damo/nlp_structbert_sentence-similarity_chinese-tiny' kwargs = dict( model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir, - model_revision='beta') + work_dir=self.tmp_dir) trainer = build_trainer(default_args=kwargs) trainer.train() @@ -80,8 +79,7 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir, - model_revision='beta') + work_dir=self.tmp_dir) trainer = build_trainer(default_args=kwargs) trainer.train() @@ -97,7 +95,7 @@ class TestTrainerWithNlp(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_user_defined_config(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' - cfg = read_config(model_id, revision='beta') + cfg = read_config(model_id) cfg.train.max_epochs = 20 cfg.preprocessor.train['label2id'] = {'0': 0, '1': 1} cfg.preprocessor.val['label2id'] = {'0': 0, '1': 1} @@ -108,8 +106,7 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - cfg_file=cfg_file, - model_revision='beta') + cfg_file=cfg_file) trainer = build_trainer(default_args=kwargs) trainer.train() @@ -233,7 +230,7 @@ class TestTrainerWithNlp(unittest.TestCase): os.makedirs(tmp_dir) model_id = 
'damo/nlp_structbert_sentence-similarity_chinese-base' - cache_path = snapshot_download(model_id, revision='beta') + cache_path = snapshot_download(model_id) model = SbertForSequenceClassification.from_pretrained(cache_path) kwargs = dict( cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION), From d40cc98994d7c5150b39373d6678f9eb895efb88 Mon Sep 17 00:00:00 2001 From: "tingwei.gtw" Date: Tue, 25 Oct 2022 22:49:15 +0800 Subject: [PATCH 120/129] [to #42322933] update IO for demo services MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改了I/O的代码,以支持modelscope的demo services Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10518318 --- .../models/cv/face_emotion/emotion_infer.py | 6 +++--- .../cv/face_human_hand_detection/det_infer.py | 11 ++++++++--- modelscope/models/cv/hand_static/hand_model.py | 8 ++++---- .../cv/product_segmentation/seg_infer.py | 5 ++--- modelscope/outputs/outputs.py | 11 ++++++----- .../pipelines/cv/face_emotion_pipeline.py | 8 ++++++-- .../cv/face_human_hand_detection_pipeline.py | 18 +++++++++++++----- .../pipelines/cv/hand_static_pipeline.py | 8 ++++++-- .../cv/product_segmentation_pipeline.py | 10 +++++++--- 9 files changed, 55 insertions(+), 30 deletions(-) diff --git a/modelscope/models/cv/face_emotion/emotion_infer.py b/modelscope/models/cv/face_emotion/emotion_infer.py index e3398592..618822ff 100644 --- a/modelscope/models/cv/face_emotion/emotion_infer.py +++ b/modelscope/models/cv/face_emotion/emotion_infer.py @@ -25,9 +25,9 @@ emotion_list = [ ] -def inference(image_path, model, face_model, score_thre=0.5, GPU=0): - image = Image.open(image_path).convert('RGB') - +def inference(image, model, face_model, score_thre=0.5, GPU=0): + image = image.cpu().numpy() + image = Image.fromarray(image) face, bbox = face_detection_PIL_v2(image, face_model) if bbox is None: logger.warn('no face detected!') diff --git a/modelscope/models/cv/face_human_hand_detection/det_infer.py b/modelscope/models/cv/face_human_hand_detection/det_infer.py index 7a7225ee..6822bd9f 100644 --- a/modelscope/models/cv/face_human_hand_detection/det_infer.py +++ b/modelscope/models/cv/face_human_hand_detection/det_infer.py @@ -115,9 +115,9 @@ std = [57.375, 57.12, 58.395] class_names = ['person', 'face', 'hand'] -def inference(model, device, img_path): +def inference(model, device, img): + img = img.cpu().numpy() img_info = {'id': 0} - img = cv2.imread(img_path) height, width = img.shape[:2] img_info['height'] = height img_info['width'] = width @@ -130,4 +130,9 @@ def inference(model, device, img_path): with torch.no_grad(): res = model(meta) result = overlay_bbox_cv(res[0], class_names, score_thresh=0.35) - return result + cls_list, bbox_list, score_list = [], [], [] + for pred in result: + cls_list.append(pred[0]) + bbox_list.append([pred[1], pred[2], pred[3], pred[4]]) + score_list.append(pred[5]) + return cls_list, bbox_list, score_list diff --git a/modelscope/models/cv/hand_static/hand_model.py b/modelscope/models/cv/hand_static/hand_model.py index 38517307..7a8a323e 100644 --- a/modelscope/models/cv/hand_static/hand_model.py +++ b/modelscope/models/cv/hand_static/hand_model.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional as F from PIL import Image from torch import nn -from torchvision.transforms import transforms +from torchvision import transforms from modelscope.metainfo import Models from modelscope.models.base import TorchModel @@ -80,9 +80,9 @@ class HandStatic(TorchModel): return pred_result -def infer(img_path, 
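# Demo-service note: preprocess() now decodes the input itself via
# LoadImage.convert_to_ndarray (so e.g. a local path or an image URL is accepted)
# and forward() hands the decoded ndarray directly to emotion_infer.inference;
# the face_human_hand_detection, hand_static and product_segmentation pipelines
# below are reworked with the same input-handling pattern.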
model, device): - - img = Image.open(img_path) +def infer(img, model, device): + img = img.cpu().numpy() + img = Image.fromarray(img) clip = spatial_transform(img) clip = clip.unsqueeze(0).to(device).float() outputs = model(clip) diff --git a/modelscope/models/cv/product_segmentation/seg_infer.py b/modelscope/models/cv/product_segmentation/seg_infer.py index 876fac66..8814d619 100644 --- a/modelscope/models/cv/product_segmentation/seg_infer.py +++ b/modelscope/models/cv/product_segmentation/seg_infer.py @@ -59,9 +59,8 @@ mean, std = np.array([[[124.55, 118.90, 102.94]]]), np.array([[[56.77, 55.97, 57.50]]]) -def inference(model, device, input_path): - img = Image.open(input_path) - img = np.array(img.convert('RGB')).astype(np.float32) +def inference(model, device, img): + img = img.cpu().numpy() img = (img - mean) / std img = cv2.resize(img, dsize=(448, 448), interpolation=cv2.INTER_LINEAR) img = torch.from_numpy(img) diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index 721fb271..cbdeede4 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -762,12 +762,13 @@ TASK_OUTPUTS = { # } Tasks.hand_static: [OutputKeys.OUTPUT], - # 'output': [ - # [2, 75, 287, 240, 510, 0.8335018754005432], - # [1, 127, 83, 332, 366, 0.9175254702568054], - # [0, 0, 0, 367, 639, 0.9693422317504883]] + # { 'labels': [2, 1, 0], + # 'boxes':[[[78, 282, 240, 504], [127, 87, 332, 370], [0, 0, 367, 639]] + # 'scores':[0.8202137351036072, 0.8987470269203186, 0.9679114818572998] # } - Tasks.face_human_hand_detection: [OutputKeys.OUTPUT], + Tasks.face_human_hand_detection: [ + OutputKeys.LABELS, OutputKeys.BOXES, OutputKeys.SCORES + ], # { # {'output': 'Happiness', 'boxes': (203, 104, 663, 564)} diff --git a/modelscope/pipelines/cv/face_emotion_pipeline.py b/modelscope/pipelines/cv/face_emotion_pipeline.py index 249493b6..9d9aa6ee 100644 --- a/modelscope/pipelines/cv/face_emotion_pipeline.py +++ b/modelscope/pipelines/cv/face_emotion_pipeline.py @@ -1,11 +1,14 @@ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
from typing import Any, Dict +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.models.cv.face_emotion import emotion_infer from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger @@ -28,10 +31,11 @@ class FaceEmotionPipeline(Pipeline): logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: - return input + img = LoadImage.convert_to_ndarray(input['img_path']) + return img def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - result, bbox = emotion_infer.inference(input['img_path'], self.model, + result, bbox = emotion_infer.inference(input, self.model, self.face_model) return {OutputKeys.OUTPUT: result, OutputKeys.BOXES: bbox} diff --git a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py index d9f214c9..d41a14dd 100644 --- a/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py +++ b/modelscope/pipelines/cv/face_human_hand_detection_pipeline.py @@ -2,11 +2,14 @@ from typing import Any, Dict +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.models.cv.face_human_hand_detection import det_infer from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -29,14 +32,19 @@ class NanoDettForFaceHumanHandDetectionPipeline(Pipeline): logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: - return input + img = LoadImage.convert_to_ndarray(input['input_path']) + return img def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - result = det_infer.inference(self.model, self.device, - input['input_path']) - logger.info(result) - return {OutputKeys.OUTPUT: result} + cls_list, bbox_list, score_list = det_infer.inference( + self.model, self.device, input) + logger.info(cls_list, bbox_list, score_list) + return { + OutputKeys.LABELS: cls_list, + OutputKeys.BOXES: bbox_list, + OutputKeys.SCORES: score_list + } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/hand_static_pipeline.py b/modelscope/pipelines/cv/hand_static_pipeline.py index 1219c873..c020b7aa 100644 --- a/modelscope/pipelines/cv/hand_static_pipeline.py +++ b/modelscope/pipelines/cv/hand_static_pipeline.py @@ -1,11 +1,14 @@ # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
from typing import Any, Dict +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.models.cv.hand_static import hand_model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -27,10 +30,11 @@ class HandStaticPipeline(Pipeline): logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: - return input + img = LoadImage.convert_to_ndarray(input['img_path']) + return img def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - result = hand_model.infer(input['img_path'], self.model, self.device) + result = hand_model.infer(input, self.model, self.device) return {OutputKeys.OUTPUT: result} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/cv/product_segmentation_pipeline.py b/modelscope/pipelines/cv/product_segmentation_pipeline.py index 244b01d7..3b1b2381 100644 --- a/modelscope/pipelines/cv/product_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/product_segmentation_pipeline.py @@ -2,11 +2,14 @@ from typing import Any, Dict +import numpy as np + from modelscope.metainfo import Pipelines from modelscope.models.cv.product_segmentation import seg_infer from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -28,12 +31,13 @@ class F3NetForProductSegmentationPipeline(Pipeline): logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: - return input + img = LoadImage.convert_to_ndarray(input['input_path']) + img = img.astype(np.float32) + return img def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - mask = seg_infer.inference(self.model, self.device, - input['input_path']) + mask = seg_infer.inference(self.model, self.device, input) return {OutputKeys.MASKS: mask} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: From ba3db0f552b408e4196bb033a632070518b88078 Mon Sep 17 00:00:00 2001 From: "siyang.ssy" Date: Tue, 25 Oct 2022 22:56:14 +0800 Subject: [PATCH 121/129] [to #42322933] fix video embedding output Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10525516 --- .../multi_modal/mmr/models/clip_for_mm_video_embedding.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index f1b1a6c7..0cc040c6 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -236,8 +236,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel): logger.info('text feature: {}'.format(sequence_output[0][0][0])) logger.info('video feature: {}'.format(visual_output[0][0][0])) - output[OutputKeys.VIDEO_EMBEDDING] = visual_output - output[OutputKeys.TEXT_EMBEDDING] = sequence_output + output[ + OutputKeys.VIDEO_EMBEDDING] = visual_output.cpu().detach().numpy() + output[OutputKeys.TEXT_EMBEDDING] = sequence_output.cpu().detach( + ).numpy() return output def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: 
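With the change above, the video/text embeddings are returned as plain numpy arrays (detached and moved to CPU) rather than torch tensors, so demo-service code can consume them without a torch dependency. Below is a minimal sketch of such a consumer; the cosine_similarity helper is illustrative only, and since the exact embedding shapes depend on the model, any leading batch/frame dimensions are simply averaged away.

import numpy as np

from modelscope.outputs import OutputKeys


def cosine_similarity(text_embedding: np.ndarray,
                      video_embedding: np.ndarray) -> float:
    # Collapse any leading batch/frame dimensions into one feature vector per
    # modality, then compare the two vectors.
    t = text_embedding.reshape(-1, text_embedding.shape[-1]).mean(axis=0)
    v = video_embedding.reshape(-1, video_embedding.shape[-1]).mean(axis=0)
    return float(np.dot(t, v) / (np.linalg.norm(t) * np.linalg.norm(v) + 1e-8))


# e.g. score = cosine_similarity(output[OutputKeys.TEXT_EMBEDDING],
#                                output[OutputKeys.VIDEO_EMBEDDING])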
From bab54bbce865a4655010457c8f0050665395a09f Mon Sep 17 00:00:00 2001 From: "yuanzheng.yuanzhen" Date: Tue, 25 Oct 2022 22:59:19 +0800 Subject: [PATCH 122/129] [to #42322933]support uni fold Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10481410 --- modelscope/metainfo.py | 10 + modelscope/models/science/__init__.py | 21 + modelscope/models/science/unifold/__init__.py | 1 + modelscope/models/science/unifold/config.py | 636 ++++++++ .../models/science/unifold/data/__init__.py | 14 + .../models/science/unifold/data/data_ops.py | 1397 +++++++++++++++++ .../science/unifold/data/msa_pairing.py | 526 +++++++ .../models/science/unifold/data/process.py | 264 ++++ .../science/unifold/data/process_multimer.py | 417 +++++ .../models/science/unifold/data/protein.py | 322 ++++ .../science/unifold/data/residue_constants.py | 1212 ++++++++++++++ .../unifold/data/stereo_chemical_props.txt | 345 ++++ .../models/science/unifold/data/utils.py | 161 ++ modelscope/models/science/unifold/dataset.py | 514 ++++++ modelscope/models/science/unifold/model.py | 75 + .../science/unifold/modules/alphafold.py | 450 ++++++ .../science/unifold/modules/attentions.py | 430 +++++ .../unifold/modules/auxillary_heads.py | 171 ++ .../models/science/unifold/modules/common.py | 387 +++++ .../science/unifold/modules/confidence.py | 159 ++ .../science/unifold/modules/embedders.py | 290 ++++ .../science/unifold/modules/evoformer.py | 362 +++++ .../science/unifold/modules/featurization.py | 195 +++ .../models/science/unifold/modules/frame.py | 562 +++++++ .../unifold/modules/structure_module.py | 592 +++++++ .../science/unifold/modules/template.py | 330 ++++ .../modules/triangle_multiplication.py | 158 ++ .../models/science/unifold/msa/__init__.py | 1 + .../models/science/unifold/msa/mmcif.py | 483 ++++++ .../science/unifold/msa/msa_identifiers.py | 88 ++ .../models/science/unifold/msa/parsers.py | 627 ++++++++ .../models/science/unifold/msa/pipeline.py | 282 ++++ .../models/science/unifold/msa/templates.py | 1110 +++++++++++++ .../science/unifold/msa/tools/__init__.py | 14 + .../science/unifold/msa/tools/hhblits.py | 170 ++ .../science/unifold/msa/tools/hhsearch.py | 111 ++ .../science/unifold/msa/tools/hmmbuild.py | 143 ++ .../science/unifold/msa/tools/hmmsearch.py | 146 ++ .../science/unifold/msa/tools/jackhmmer.py | 224 +++ .../science/unifold/msa/tools/kalign.py | 110 ++ .../models/science/unifold/msa/tools/utils.py | 40 + .../models/science/unifold/msa/utils.py | 89 ++ modelscope/pipelines/science/__init__.py | 22 + .../science/protein_structure_pipeline.py | 215 +++ modelscope/preprocessors/science/__init__.py | 20 + modelscope/preprocessors/science/uni_fold.py | 569 +++++++ modelscope/utils/constant.py | 11 +- requirements/science.txt | 6 + tests/pipelines/test_unifold.py | 34 + 49 files changed, 14515 insertions(+), 1 deletion(-) create mode 100644 modelscope/models/science/__init__.py create mode 100644 modelscope/models/science/unifold/__init__.py create mode 100644 modelscope/models/science/unifold/config.py create mode 100644 modelscope/models/science/unifold/data/__init__.py create mode 100644 modelscope/models/science/unifold/data/data_ops.py create mode 100644 modelscope/models/science/unifold/data/msa_pairing.py create mode 100644 modelscope/models/science/unifold/data/process.py create mode 100644 modelscope/models/science/unifold/data/process_multimer.py create mode 100644 modelscope/models/science/unifold/data/protein.py create mode 100644 
modelscope/models/science/unifold/data/residue_constants.py create mode 100644 modelscope/models/science/unifold/data/stereo_chemical_props.txt create mode 100644 modelscope/models/science/unifold/data/utils.py create mode 100644 modelscope/models/science/unifold/dataset.py create mode 100644 modelscope/models/science/unifold/model.py create mode 100644 modelscope/models/science/unifold/modules/alphafold.py create mode 100644 modelscope/models/science/unifold/modules/attentions.py create mode 100644 modelscope/models/science/unifold/modules/auxillary_heads.py create mode 100644 modelscope/models/science/unifold/modules/common.py create mode 100644 modelscope/models/science/unifold/modules/confidence.py create mode 100644 modelscope/models/science/unifold/modules/embedders.py create mode 100644 modelscope/models/science/unifold/modules/evoformer.py create mode 100644 modelscope/models/science/unifold/modules/featurization.py create mode 100644 modelscope/models/science/unifold/modules/frame.py create mode 100644 modelscope/models/science/unifold/modules/structure_module.py create mode 100644 modelscope/models/science/unifold/modules/template.py create mode 100644 modelscope/models/science/unifold/modules/triangle_multiplication.py create mode 100644 modelscope/models/science/unifold/msa/__init__.py create mode 100644 modelscope/models/science/unifold/msa/mmcif.py create mode 100644 modelscope/models/science/unifold/msa/msa_identifiers.py create mode 100644 modelscope/models/science/unifold/msa/parsers.py create mode 100644 modelscope/models/science/unifold/msa/pipeline.py create mode 100644 modelscope/models/science/unifold/msa/templates.py create mode 100644 modelscope/models/science/unifold/msa/tools/__init__.py create mode 100644 modelscope/models/science/unifold/msa/tools/hhblits.py create mode 100644 modelscope/models/science/unifold/msa/tools/hhsearch.py create mode 100644 modelscope/models/science/unifold/msa/tools/hmmbuild.py create mode 100644 modelscope/models/science/unifold/msa/tools/hmmsearch.py create mode 100644 modelscope/models/science/unifold/msa/tools/jackhmmer.py create mode 100644 modelscope/models/science/unifold/msa/tools/kalign.py create mode 100644 modelscope/models/science/unifold/msa/tools/utils.py create mode 100644 modelscope/models/science/unifold/msa/utils.py create mode 100644 modelscope/pipelines/science/__init__.py create mode 100644 modelscope/pipelines/science/protein_structure_pipeline.py create mode 100644 modelscope/preprocessors/science/__init__.py create mode 100644 modelscope/preprocessors/science/uni_fold.py create mode 100644 requirements/science.txt create mode 100644 tests/pipelines/test_unifold.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 16190eb8..c5067c39 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -99,6 +99,10 @@ class Models(object): team = 'team-multi-modal-similarity' video_clip = 'video-clip-multi-modal-embedding' + # science models + unifold = 'unifold' + unifold_symmetry = 'unifold-symmetry' + class TaskModels(object): # nlp task @@ -266,6 +270,9 @@ class Pipelines(object): image_text_retrieval = 'image-text-retrieval' ofa_ocr_recognition = 'ofa-ocr-recognition' + # science tasks + protein_structure = 'unifold-protein-structure' + class Trainers(object): """ Names for different trainer. 
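The new Models.unifold constant is consumed when the concrete model class registers itself with the registry, in the same way the bloom backbone added in patch 115 registers under Models.bloom. The registration itself lives in modelscope/models/science/unifold/model.py, which this diff only lists, so the sketch below is an assumption of the usual pattern: the group key Tasks.protein_structure is presumed to be among the additions to modelscope/utils/constant.py, and the base class and constructor signature in the real file may differ.

# Hypothetical registration sketch; see the assumptions noted above.
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.protein_structure, module_name=Models.unifold)
class UnifoldForProteinStructrue(TorchModel):

    def __init__(self, model_dir: str, **kwargs):
        super().__init__(model_dir, **kwargs)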
@@ -368,6 +375,9 @@ class Preprocessors(object): ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' + # science preprocessor + unifold_preprocessor = 'unifold-preprocessor' + class Metrics(object): """ Names for different metrics. diff --git a/modelscope/models/science/__init__.py b/modelscope/models/science/__init__.py new file mode 100644 index 00000000..50ab55d7 --- /dev/null +++ b/modelscope/models/science/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .unifold import UnifoldForProteinStructrue + +else: + _import_structure = {'unifold': ['UnifoldForProteinStructrue']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/science/unifold/__init__.py b/modelscope/models/science/unifold/__init__.py new file mode 100644 index 00000000..75435fed --- /dev/null +++ b/modelscope/models/science/unifold/__init__.py @@ -0,0 +1 @@ +from .model import UnifoldForProteinStructrue diff --git a/modelscope/models/science/unifold/config.py b/modelscope/models/science/unifold/config.py new file mode 100644 index 00000000..e760fbf9 --- /dev/null +++ b/modelscope/models/science/unifold/config.py @@ -0,0 +1,636 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import copy +from typing import Any + +import ml_collections as mlc + +N_RES = 'number of residues' +N_MSA = 'number of MSA sequences' +N_EXTRA_MSA = 'number of extra MSA sequences' +N_TPL = 'number of templates' + +d_pair = mlc.FieldReference(128, field_type=int) +d_msa = mlc.FieldReference(256, field_type=int) +d_template = mlc.FieldReference(64, field_type=int) +d_extra_msa = mlc.FieldReference(64, field_type=int) +d_single = mlc.FieldReference(384, field_type=int) +max_recycling_iters = mlc.FieldReference(3, field_type=int) +chunk_size = mlc.FieldReference(4, field_type=int) +aux_distogram_bins = mlc.FieldReference(64, field_type=int) +eps = mlc.FieldReference(1e-8, field_type=float) +inf = mlc.FieldReference(3e4, field_type=float) +use_templates = mlc.FieldReference(True, field_type=bool) +is_multimer = mlc.FieldReference(False, field_type=bool) + + +def base_config(): + return mlc.ConfigDict({ + 'data': { + 'common': { + 'features': { + 'aatype': [N_RES], + 'all_atom_mask': [N_RES, None], + 'all_atom_positions': [N_RES, None, None], + 'alt_chi_angles': [N_RES, None], + 'atom14_alt_gt_exists': [N_RES, None], + 'atom14_alt_gt_positions': [N_RES, None, None], + 'atom14_atom_exists': [N_RES, None], + 'atom14_atom_is_ambiguous': [N_RES, None], + 'atom14_gt_exists': [N_RES, None], + 'atom14_gt_positions': [N_RES, None, None], + 'atom37_atom_exists': [N_RES, None], + 'frame_mask': [N_RES], + 'true_frame_tensor': [N_RES, None, None], + 'bert_mask': [N_MSA, N_RES], + 'chi_angles_sin_cos': [N_RES, None, None], + 'chi_mask': [N_RES, None], + 'extra_msa_deletion_value': [N_EXTRA_MSA, N_RES], + 'extra_msa_has_deletion': [N_EXTRA_MSA, N_RES], + 'extra_msa': [N_EXTRA_MSA, N_RES], + 'extra_msa_mask': [N_EXTRA_MSA, N_RES], + 'extra_msa_row_mask': [N_EXTRA_MSA], + 'is_distillation': [], + 'msa_feat': [N_MSA, N_RES, None], + 'msa_mask': [N_MSA, N_RES], + 'msa_chains': [N_MSA, None], + 
'msa_row_mask': [N_MSA], + 'num_recycling_iters': [], + 'pseudo_beta': [N_RES, None], + 'pseudo_beta_mask': [N_RES], + 'residue_index': [N_RES], + 'residx_atom14_to_atom37': [N_RES, None], + 'residx_atom37_to_atom14': [N_RES, None], + 'resolution': [], + 'rigidgroups_alt_gt_frames': [N_RES, None, None, None], + 'rigidgroups_group_exists': [N_RES, None], + 'rigidgroups_group_is_ambiguous': [N_RES, None], + 'rigidgroups_gt_exists': [N_RES, None], + 'rigidgroups_gt_frames': [N_RES, None, None, None], + 'seq_length': [], + 'seq_mask': [N_RES], + 'target_feat': [N_RES, None], + 'template_aatype': [N_TPL, N_RES], + 'template_all_atom_mask': [N_TPL, N_RES, None], + 'template_all_atom_positions': [N_TPL, N_RES, None, None], + 'template_alt_torsion_angles_sin_cos': [ + N_TPL, + N_RES, + None, + None, + ], + 'template_frame_mask': [N_TPL, N_RES], + 'template_frame_tensor': [N_TPL, N_RES, None, None], + 'template_mask': [N_TPL], + 'template_pseudo_beta': [N_TPL, N_RES, None], + 'template_pseudo_beta_mask': [N_TPL, N_RES], + 'template_sum_probs': [N_TPL, None], + 'template_torsion_angles_mask': [N_TPL, N_RES, None], + 'template_torsion_angles_sin_cos': + [N_TPL, N_RES, None, None], + 'true_msa': [N_MSA, N_RES], + 'use_clamped_fape': [], + 'assembly_num_chains': [1], + 'asym_id': [N_RES], + 'sym_id': [N_RES], + 'entity_id': [N_RES], + 'num_sym': [N_RES], + 'asym_len': [None], + 'cluster_bias_mask': [N_MSA], + }, + 'masked_msa': { + 'profile_prob': 0.1, + 'same_prob': 0.1, + 'uniform_prob': 0.1, + }, + 'block_delete_msa': { + 'msa_fraction_per_block': 0.3, + 'randomize_num_blocks': False, + 'num_blocks': 5, + 'min_num_msa': 16, + }, + 'random_delete_msa': { + 'max_msa_entry': 1 << 25, # := 33554432 + }, + 'v2_feature': + False, + 'gumbel_sample': + False, + 'max_extra_msa': + 1024, + 'msa_cluster_features': + True, + 'reduce_msa_clusters_by_max_templates': + True, + 'resample_msa_in_recycling': + True, + 'template_features': [ + 'template_all_atom_positions', + 'template_sum_probs', + 'template_aatype', + 'template_all_atom_mask', + ], + 'unsupervised_features': [ + 'aatype', + 'residue_index', + 'msa', + 'msa_chains', + 'num_alignments', + 'seq_length', + 'between_segment_residues', + 'deletion_matrix', + 'num_recycling_iters', + 'crop_and_fix_size_seed', + ], + 'recycling_features': [ + 'msa_chains', + 'msa_mask', + 'msa_row_mask', + 'bert_mask', + 'true_msa', + 'msa_feat', + 'extra_msa_deletion_value', + 'extra_msa_has_deletion', + 'extra_msa', + 'extra_msa_mask', + 'extra_msa_row_mask', + 'is_distillation', + ], + 'multimer_features': [ + 'assembly_num_chains', + 'asym_id', + 'sym_id', + 'num_sym', + 'entity_id', + 'asym_len', + 'cluster_bias_mask', + ], + 'use_templates': + use_templates, + 'is_multimer': + is_multimer, + 'use_template_torsion_angles': + use_templates, + 'max_recycling_iters': + max_recycling_iters, + }, + 'supervised': { + 'use_clamped_fape_prob': + 1.0, + 'supervised_features': [ + 'all_atom_mask', + 'all_atom_positions', + 'resolution', + 'use_clamped_fape', + 'is_distillation', + ], + }, + 'predict': { + 'fixed_size': True, + 'subsample_templates': False, + 'block_delete_msa': False, + 'random_delete_msa': True, + 'masked_msa_replace_fraction': 0.15, + 'max_msa_clusters': 128, + 'max_templates': 4, + 'num_ensembles': 2, + 'crop': False, + 'crop_size': None, + 'supervised': False, + 'biased_msa_by_chain': False, + 'share_mask': False, + }, + 'eval': { + 'fixed_size': True, + 'subsample_templates': False, + 'block_delete_msa': False, + 'random_delete_msa': True, + 
'masked_msa_replace_fraction': 0.15, + 'max_msa_clusters': 128, + 'max_templates': 4, + 'num_ensembles': 1, + 'crop': False, + 'crop_size': None, + 'spatial_crop_prob': 0.5, + 'ca_ca_threshold': 10.0, + 'supervised': True, + 'biased_msa_by_chain': False, + 'share_mask': False, + }, + 'train': { + 'fixed_size': True, + 'subsample_templates': True, + 'block_delete_msa': True, + 'random_delete_msa': True, + 'masked_msa_replace_fraction': 0.15, + 'max_msa_clusters': 128, + 'max_templates': 4, + 'num_ensembles': 1, + 'crop': True, + 'crop_size': 256, + 'spatial_crop_prob': 0.5, + 'ca_ca_threshold': 10.0, + 'supervised': True, + 'use_clamped_fape_prob': 1.0, + 'max_distillation_msa_clusters': 1000, + 'biased_msa_by_chain': True, + 'share_mask': True, + }, + }, + 'globals': { + 'chunk_size': chunk_size, + 'block_size': None, + 'd_pair': d_pair, + 'd_msa': d_msa, + 'd_template': d_template, + 'd_extra_msa': d_extra_msa, + 'd_single': d_single, + 'eps': eps, + 'inf': inf, + 'max_recycling_iters': max_recycling_iters, + 'alphafold_original_mode': False, + }, + 'model': { + 'is_multimer': is_multimer, + 'input_embedder': { + 'tf_dim': 22, + 'msa_dim': 49, + 'd_pair': d_pair, + 'd_msa': d_msa, + 'relpos_k': 32, + 'max_relative_chain': 2, + }, + 'recycling_embedder': { + 'd_pair': d_pair, + 'd_msa': d_msa, + 'min_bin': 3.25, + 'max_bin': 20.75, + 'num_bins': 15, + 'inf': 1e8, + }, + 'template': { + 'distogram': { + 'min_bin': 3.25, + 'max_bin': 50.75, + 'num_bins': 39, + }, + 'template_angle_embedder': { + 'd_in': 57, + 'd_out': d_msa, + }, + 'template_pair_embedder': { + 'd_in': 88, + 'v2_d_in': [39, 1, 22, 22, 1, 1, 1, 1], + 'd_pair': d_pair, + 'd_out': d_template, + 'v2_feature': False, + }, + 'template_pair_stack': { + 'd_template': d_template, + 'd_hid_tri_att': 16, + 'd_hid_tri_mul': 64, + 'num_blocks': 2, + 'num_heads': 4, + 'pair_transition_n': 2, + 'dropout_rate': 0.25, + 'inf': 1e9, + 'tri_attn_first': True, + }, + 'template_pointwise_attention': { + 'enabled': True, + 'd_template': d_template, + 'd_pair': d_pair, + 'd_hid': 16, + 'num_heads': 4, + 'inf': 1e5, + }, + 'inf': 1e5, + 'eps': 1e-6, + 'enabled': use_templates, + 'embed_angles': use_templates, + }, + 'extra_msa': { + 'extra_msa_embedder': { + 'd_in': 25, + 'd_out': d_extra_msa, + }, + 'extra_msa_stack': { + 'd_msa': d_extra_msa, + 'd_pair': d_pair, + 'd_hid_msa_att': 8, + 'd_hid_opm': 32, + 'd_hid_mul': 128, + 'd_hid_pair_att': 32, + 'num_heads_msa': 8, + 'num_heads_pair': 4, + 'num_blocks': 4, + 'transition_n': 4, + 'msa_dropout': 0.15, + 'pair_dropout': 0.25, + 'inf': 1e9, + 'eps': 1e-10, + 'outer_product_mean_first': False, + }, + 'enabled': True, + }, + 'evoformer_stack': { + 'd_msa': d_msa, + 'd_pair': d_pair, + 'd_hid_msa_att': 32, + 'd_hid_opm': 32, + 'd_hid_mul': 128, + 'd_hid_pair_att': 32, + 'd_single': d_single, + 'num_heads_msa': 8, + 'num_heads_pair': 4, + 'num_blocks': 48, + 'transition_n': 4, + 'msa_dropout': 0.15, + 'pair_dropout': 0.25, + 'inf': 1e9, + 'eps': 1e-10, + 'outer_product_mean_first': False, + }, + 'structure_module': { + 'd_single': d_single, + 'd_pair': d_pair, + 'd_ipa': 16, + 'd_angle': 128, + 'num_heads_ipa': 12, + 'num_qk_points': 4, + 'num_v_points': 8, + 'dropout_rate': 0.1, + 'num_blocks': 8, + 'no_transition_layers': 1, + 'num_resnet_blocks': 2, + 'num_angles': 7, + 'trans_scale_factor': 10, + 'epsilon': 1e-12, + 'inf': 1e5, + 'separate_kv': False, + 'ipa_bias': True, + }, + 'heads': { + 'plddt': { + 'num_bins': 50, + 'd_in': d_single, + 'd_hid': 128, + }, + 'distogram': { + 'd_pair': d_pair, + 
'num_bins': aux_distogram_bins, + 'disable_enhance_head': False, + }, + 'pae': { + 'd_pair': d_pair, + 'num_bins': aux_distogram_bins, + 'enabled': False, + 'iptm_weight': 0.8, + 'disable_enhance_head': False, + }, + 'masked_msa': { + 'd_msa': d_msa, + 'd_out': 23, + 'disable_enhance_head': False, + }, + 'experimentally_resolved': { + 'd_single': d_single, + 'd_out': 37, + 'enabled': False, + 'disable_enhance_head': False, + }, + }, + }, + 'loss': { + 'distogram': { + 'min_bin': 2.3125, + 'max_bin': 21.6875, + 'num_bins': 64, + 'eps': 1e-6, + 'weight': 0.3, + }, + 'experimentally_resolved': { + 'eps': 1e-8, + 'min_resolution': 0.1, + 'max_resolution': 3.0, + 'weight': 0.0, + }, + 'fape': { + 'backbone': { + 'clamp_distance': 10.0, + 'clamp_distance_between_chains': 30.0, + 'loss_unit_distance': 10.0, + 'loss_unit_distance_between_chains': 20.0, + 'weight': 0.5, + 'eps': 1e-4, + }, + 'sidechain': { + 'clamp_distance': 10.0, + 'length_scale': 10.0, + 'weight': 0.5, + 'eps': 1e-4, + }, + 'weight': 1.0, + }, + 'plddt': { + 'min_resolution': 0.1, + 'max_resolution': 3.0, + 'cutoff': 15.0, + 'num_bins': 50, + 'eps': 1e-10, + 'weight': 0.01, + }, + 'masked_msa': { + 'eps': 1e-8, + 'weight': 2.0, + }, + 'supervised_chi': { + 'chi_weight': 0.5, + 'angle_norm_weight': 0.01, + 'eps': 1e-6, + 'weight': 1.0, + }, + 'violation': { + 'violation_tolerance_factor': 12.0, + 'clash_overlap_tolerance': 1.5, + 'bond_angle_loss_weight': 0.3, + 'eps': 1e-6, + 'weight': 0.0, + }, + 'pae': { + 'max_bin': 31, + 'num_bins': 64, + 'min_resolution': 0.1, + 'max_resolution': 3.0, + 'eps': 1e-8, + 'weight': 0.0, + }, + 'repr_norm': { + 'weight': 0.01, + 'tolerance': 1.0, + }, + 'chain_centre_mass': { + 'weight': 0.0, + 'eps': 1e-8, + }, + }, + }) + + +def recursive_set(c: mlc.ConfigDict, key: str, value: Any, ignore: str = None): + with c.unlocked(): + for k, v in c.items(): + if ignore is not None and k == ignore: + continue + if isinstance(v, mlc.ConfigDict): + recursive_set(v, key, value) + elif k == key: + c[k] = value + + +def model_config(name, train=False): + c = copy.deepcopy(base_config()) + + def model_2_v2(c): + recursive_set(c, 'v2_feature', True) + recursive_set(c, 'gumbel_sample', True) + c.model.heads.masked_msa.d_out = 22 + c.model.structure_module.separate_kv = True + c.model.structure_module.ipa_bias = False + c.model.template.template_angle_embedder.d_in = 34 + return c + + def multimer(c): + recursive_set(c, 'is_multimer', True) + recursive_set(c, 'max_extra_msa', 1152) + recursive_set(c, 'max_msa_clusters', 128) + recursive_set(c, 'v2_feature', True) + recursive_set(c, 'gumbel_sample', True) + c.model.template.template_angle_embedder.d_in = 34 + c.model.template.template_pair_stack.tri_attn_first = False + c.model.template.template_pointwise_attention.enabled = False + c.model.heads.pae.enabled = True + # we forget to enable it in our training, so disable it here + c.model.heads.pae.disable_enhance_head = True + c.model.heads.masked_msa.d_out = 22 + c.model.structure_module.separate_kv = True + c.model.structure_module.ipa_bias = False + c.model.structure_module.trans_scale_factor = 20 + c.loss.pae.weight = 0.1 + c.model.input_embedder.tf_dim = 21 + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + c.loss.chain_centre_mass.weight = 1.0 + return c + + if name == 'model_1': + pass + elif name == 'model_1_ft': + recursive_set(c, 'max_extra_msa', 5120) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + elif name == 'model_1_af2': + 
recursive_set(c, 'max_extra_msa', 5120) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + c.loss.repr_norm.weight = 0 + c.model.heads.experimentally_resolved.enabled = True + c.loss.experimentally_resolved.weight = 0.01 + c.globals.alphafold_original_mode = True + elif name == 'model_2': + pass + elif name == 'model_init': + pass + elif name == 'model_init_af2': + c.globals.alphafold_original_mode = True + pass + elif name == 'model_2_ft': + recursive_set(c, 'max_extra_msa', 1024) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + elif name == 'model_2_af2': + recursive_set(c, 'max_extra_msa', 1024) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + c.loss.repr_norm.weight = 0 + c.model.heads.experimentally_resolved.enabled = True + c.loss.experimentally_resolved.weight = 0.01 + c.globals.alphafold_original_mode = True + elif name == 'model_2_v2': + c = model_2_v2(c) + elif name == 'model_2_v2_ft': + c = model_2_v2(c) + recursive_set(c, 'max_extra_msa', 1024) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + elif name == 'model_3_af2' or name == 'model_4_af2': + recursive_set(c, 'max_extra_msa', 5120) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + c.loss.repr_norm.weight = 0 + c.model.heads.experimentally_resolved.enabled = True + c.loss.experimentally_resolved.weight = 0.01 + c.globals.alphafold_original_mode = True + c.model.template.enabled = False + c.model.template.embed_angles = False + recursive_set(c, 'use_templates', False) + recursive_set(c, 'use_template_torsion_angles', False) + elif name == 'model_5_af2': + recursive_set(c, 'max_extra_msa', 1024) + recursive_set(c, 'max_msa_clusters', 512) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.02 + c.loss.repr_norm.weight = 0 + c.model.heads.experimentally_resolved.enabled = True + c.loss.experimentally_resolved.weight = 0.01 + c.globals.alphafold_original_mode = True + c.model.template.enabled = False + c.model.template.embed_angles = False + recursive_set(c, 'use_templates', False) + recursive_set(c, 'use_template_torsion_angles', False) + elif name == 'multimer': + c = multimer(c) + elif name == 'multimer_ft': + c = multimer(c) + recursive_set(c, 'max_extra_msa', 1152) + recursive_set(c, 'max_msa_clusters', 256) + c.data.train.crop_size = 384 + c.loss.violation.weight = 0.5 + elif name == 'multimer_af2': + recursive_set(c, 'max_extra_msa', 1152) + recursive_set(c, 'max_msa_clusters', 256) + recursive_set(c, 'is_multimer', True) + recursive_set(c, 'v2_feature', True) + recursive_set(c, 'gumbel_sample', True) + c.model.template.template_angle_embedder.d_in = 34 + c.model.template.template_pair_stack.tri_attn_first = False + c.model.template.template_pointwise_attention.enabled = False + c.model.heads.pae.enabled = True + c.model.heads.experimentally_resolved.enabled = True + c.model.heads.masked_msa.d_out = 22 + c.model.structure_module.separate_kv = True + c.model.structure_module.ipa_bias = False + c.model.structure_module.trans_scale_factor = 20 + c.loss.pae.weight = 0.1 + c.loss.violation.weight = 0.5 + c.loss.experimentally_resolved.weight = 0.01 + c.model.input_embedder.tf_dim = 21 + c.globals.alphafold_original_mode = True + c.data.train.crop_size = 384 + c.loss.repr_norm.weight = 0 + c.loss.chain_centre_mass.weight = 
1.0 + recursive_set(c, 'outer_product_mean_first', True) + else: + raise ValueError(f'invalid --model-name: {name}.') + if train: + c.globals.chunk_size = None + recursive_set(c, 'inf', 3e4) + recursive_set(c, 'eps', 1e-5, 'loss') + return c diff --git a/modelscope/models/science/unifold/data/__init__.py b/modelscope/models/science/unifold/data/__init__.py new file mode 100644 index 00000000..9821d212 --- /dev/null +++ b/modelscope/models/science/unifold/data/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data pipeline for model features.""" diff --git a/modelscope/models/science/unifold/data/data_ops.py b/modelscope/models/science/unifold/data/data_ops.py new file mode 100644 index 00000000..637aa0cd --- /dev/null +++ b/modelscope/models/science/unifold/data/data_ops.py @@ -0,0 +1,1397 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import itertools +from functools import reduce, wraps +from operator import add +from typing import List, MutableMapping, Optional + +import numpy as np +import torch +from unicore.data import data_utils +from unicore.utils import batched_gather, one_hot, tensor_tree_map, tree_map + +from modelscope.models.science.unifold.config import (N_EXTRA_MSA, N_MSA, + N_RES, N_TPL) +from modelscope.models.science.unifold.data import residue_constants as rc +from modelscope.models.science.unifold.modules.frame import Frame, Rotation + +NumpyDict = MutableMapping[str, np.ndarray] +TorchDict = MutableMapping[str, np.ndarray] + +protein: TorchDict + +MSA_FEATURE_NAMES = [ + 'msa', + 'deletion_matrix', + 'msa_mask', + 'msa_row_mask', + 'bert_mask', + 'true_msa', + 'msa_chains', +] + + +def cast_to_64bit_ints(protein): + # We keep all ints as int64 + for k, v in protein.items(): + if k.endswith('_mask'): + protein[k] = v.type(torch.float32) + elif v.dtype in (torch.int32, torch.uint8, torch.int8): + protein[k] = v.type(torch.int64) + + return protein + + +def make_seq_mask(protein): + protein['seq_mask'] = torch.ones( + protein['aatype'].shape, dtype=torch.float32) + return protein + + +def make_template_mask(protein): + protein['template_mask'] = torch.ones( + protein['template_aatype'].shape[0], dtype=torch.float32) + return protein + + +def curry1(f): + """Supply all arguments but the first.""" + + @wraps(f) + def fc(*args, **kwargs): + return lambda x: f(x, *args, **kwargs) + + return fc + + +def correct_msa_restypes(protein): + """Correct MSA restype to have the same order as rc.""" + protein['msa'] = protein['msa'].long() + new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE + new_order = ( + torch.tensor(new_order_list, dtype=torch.int8).unsqueeze(-1).expand( + -1, protein['msa'].shape[1])) + protein['msa'] = torch.gather(new_order, 0, protein['msa']).long() + + return protein + + +def squeeze_features(protein): + """Remove singleton and repeated dimensions in protein 
features.""" + if len(protein['aatype'].shape) == 2: + protein['aatype'] = torch.argmax(protein['aatype'], dim=-1) + if 'resolution' in protein and len(protein['resolution'].shape) == 1: + # use tensor for resolution + protein['resolution'] = protein['resolution'][0] + for k in [ + 'domain_name', + 'msa', + 'num_alignments', + 'seq_length', + 'sequence', + 'superfamily', + 'deletion_matrix', + 'between_segment_residues', + 'residue_index', + 'template_all_atom_mask', + ]: + if k in protein and len(protein[k].shape): + final_dim = protein[k].shape[-1] + if isinstance(final_dim, int) and final_dim == 1: + if torch.is_tensor(protein[k]): + protein[k] = torch.squeeze(protein[k], dim=-1) + else: + protein[k] = np.squeeze(protein[k], axis=-1) + + for k in ['seq_length', 'num_alignments']: + if k in protein and len(protein[k].shape): + protein[k] = protein[k][0] + + return protein + + +@curry1 +def randomly_replace_msa_with_unknown(protein, replace_proportion): + """Replace a portion of the MSA with 'X'.""" + if replace_proportion > 0.0: + msa_mask = np.random.rand(protein['msa'].shape) < replace_proportion + x_idx = 20 + gap_idx = 21 + msa_mask = torch.logical_and(msa_mask, protein['msa'] != gap_idx) + protein['msa'] = torch.where(msa_mask, + torch.ones_like(protein['msa']) * x_idx, + protein['msa']) + aatype_mask = np.random.rand( + protein['aatype'].shape) < replace_proportion + + protein['aatype'] = torch.where( + aatype_mask, + torch.ones_like(protein['aatype']) * x_idx, + protein['aatype'], + ) + return protein + + +def gumbel_noise(shape): + """Generate Gumbel Noise of given Shape. + This generates samples from Gumbel(0, 1). + Args: + shape: Shape of noise to return. + Returns: + Gumbel noise of given shape. + """ + epsilon = 1e-6 + uniform_noise = torch.from_numpy(np.random.uniform(0, 1, shape)) + gumbel = -torch.log(-torch.log(uniform_noise + epsilon) + epsilon) + return gumbel + + +def gumbel_max_sample(logits): + """Samples from a probability distribution given by 'logits'. + This uses Gumbel-max trick to implement the sampling in an efficient manner. + Args: + logits: Logarithm of probabilities to sample from, probabilities can be + unnormalized. + Returns: + Sample from logprobs in one-hot form. + """ + z = gumbel_noise(logits.shape) + return torch.argmax(logits + z, dim=-1) + + +def gumbel_argsort_sample_idx(logits): + """Samples with replacement from a distribution given by 'logits'. + This uses Gumbel trick to implement the sampling an efficient manner. For a + distribution over k items this samples k times without replacement, so this + is effectively sampling a random permutation with probabilities over the + permutations derived from the logprobs. + Args: + logits: Logarithm of probabilities to sample from, probabilities can be + unnormalized. 
+ Returns: + Sample from logprobs in index + """ + z = gumbel_noise(logits.shape) + return torch.argsort(logits + z, dim=-1, descending=True) + + +def uniform_permutation(num_seq): + shuffled = torch.from_numpy(np.random.permutation(num_seq - 1) + 1) + return torch.cat((torch.tensor([0]), shuffled), dim=0) + + +def gumbel_permutation(msa_mask, msa_chains=None): + has_msa = torch.sum(msa_mask.long(), dim=-1) > 0 + # default logits is zero + logits = torch.zeros_like(has_msa, dtype=torch.float32) + logits[~has_msa] = -1e6 + # one sample only + assert len(logits.shape) == 1 + # skip first row + logits = logits[1:] + has_msa = has_msa[1:] + if logits.shape[0] == 0: + return torch.tensor([0]) + if msa_chains is not None: + # skip first row + msa_chains = msa_chains[1:].reshape(-1) + msa_chains[~has_msa] = 0 + keys, counts = np.unique(msa_chains, return_counts=True) + num_has_msa = has_msa.sum() + num_pair = (msa_chains == 1).sum() + num_unpair = num_has_msa - num_pair + num_chains = (keys > 1).sum() + logits[has_msa] = 1.0 / (num_has_msa + 1e-6) + logits[~has_msa] = 0 + for k in keys: + if k > 1: + cur_mask = msa_chains == k + cur_cnt = cur_mask.sum() + if cur_cnt > 0: + logits[cur_mask] *= num_unpair / (num_chains * cur_cnt) + logits = torch.log(logits + 1e-6) + shuffled = gumbel_argsort_sample_idx(logits) + 1 + return torch.cat((torch.tensor([0]), shuffled), dim=0) + + +@curry1 +def sample_msa(protein, + max_seq, + keep_extra, + gumbel_sample=False, + biased_msa_by_chain=False): + """Sample MSA randomly, remaining sequences are stored are stored as `extra_*`.""" + num_seq = protein['msa'].shape[0] + num_sel = min(max_seq, num_seq) + if not gumbel_sample: + index_order = uniform_permutation(num_seq) + else: + msa_chains = ( + protein['msa_chains'] if + (biased_msa_by_chain and 'msa_chains' in protein) else None) + index_order = gumbel_permutation(protein['msa_mask'], msa_chains) + num_sel = min(max_seq, num_seq) + sel_seq, not_sel_seq = torch.split(index_order, + [num_sel, num_seq - num_sel]) + + for k in MSA_FEATURE_NAMES: + if k in protein: + if keep_extra: + protein['extra_' + k] = torch.index_select( + protein[k], 0, not_sel_seq) + protein[k] = torch.index_select(protein[k], 0, sel_seq) + + return protein + + +@curry1 +def sample_msa_distillation(protein, max_seq): + if 'is_distillation' in protein and protein['is_distillation'] == 1: + protein = sample_msa(max_seq, keep_extra=False)(protein) + return protein + + +@curry1 +def random_delete_msa(protein, config): + # to reduce the cost of msa features + num_seq = protein['msa'].shape[0] + seq_len = protein['msa'].shape[1] + max_seq = config.max_msa_entry // seq_len + if num_seq > max_seq: + keep_index = ( + torch.from_numpy( + np.random.choice(num_seq - 1, max_seq - 1, + replace=False)).long() + 1) + keep_index = torch.sort(keep_index)[0] + keep_index = torch.cat((torch.tensor([0]), keep_index), dim=0) + for k in MSA_FEATURE_NAMES: + if k in protein: + protein[k] = torch.index_select(protein[k], 0, keep_index) + return protein + + +@curry1 +def crop_extra_msa(protein, max_extra_msa): + num_seq = protein['extra_msa'].shape[0] + num_sel = min(max_extra_msa, num_seq) + select_indices = torch.from_numpy(np.random.permutation(num_seq)[:num_sel]) + for k in MSA_FEATURE_NAMES: + if 'extra_' + k in protein: + protein['extra_' + k] = torch.index_select(protein['extra_' + k], + 0, select_indices) + + return protein + + +def delete_extra_msa(protein): + for k in MSA_FEATURE_NAMES: + if 'extra_' + k in protein: + del protein['extra_' + k] + return 
protein + + +@curry1 +def block_delete_msa(protein, config): + if 'is_distillation' in protein and protein['is_distillation'] == 1: + return protein + num_seq = protein['msa'].shape[0] + if num_seq <= config.min_num_msa: + return protein + block_num_seq = torch.floor( + torch.tensor(num_seq, dtype=torch.float32) + * config.msa_fraction_per_block).to(torch.int32) + + if config.randomize_num_blocks: + nb = np.random.randint(0, config.num_blocks + 1) + else: + nb = config.num_blocks + + del_block_starts = torch.from_numpy(np.random.randint(0, num_seq, [nb])) + del_blocks = del_block_starts[:, None] + torch.arange(0, block_num_seq) + del_blocks = torch.clip(del_blocks, 0, num_seq - 1) + del_indices = torch.unique(del_blocks.view(-1)) + # add zeros to ensure cnt_zero > 1 + combined = torch.hstack((torch.arange(0, num_seq)[None], del_indices[None], + torch.zeros(2)[None])).long() + uniques, counts = combined.unique(return_counts=True) + difference = uniques[counts == 1] + # intersection = uniques[counts > 1] + keep_indices = difference.view(-1) + keep_indices = torch.hstack( + [torch.zeros(1).long()[None], keep_indices[None]]).view(-1) + assert int(keep_indices[0]) == 0 + for k in MSA_FEATURE_NAMES: + if k in protein: + protein[k] = torch.index_select(protein[k], 0, index=keep_indices) + return protein + + +@curry1 +def nearest_neighbor_clusters(protein, gap_agreement_weight=0.0): + weights = torch.cat( + [torch.ones(21), gap_agreement_weight * torch.ones(1), + torch.zeros(1)], + 0, + ) + + msa_one_hot = one_hot(protein['msa'], 23) + sample_one_hot = protein['msa_mask'][:, :, None] * msa_one_hot + extra_msa_one_hot = one_hot(protein['extra_msa'], 23) + extra_one_hot = protein['extra_msa_mask'][:, :, None] * extra_msa_one_hot + + num_seq, num_res, _ = sample_one_hot.shape + extra_num_seq, _, _ = extra_one_hot.shape + + # Compute tf.einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot, weights) + # in an optimized fashion to avoid possible memory or computation blowup. 
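+ # Flattening the (num_seq, num_res, 23) one-hot tensors to
+ # (num_seq, num_res * 23) turns the weighted per-residue agreement into a
+ # single matmul with output shape (extra_num_seq, num_seq).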
+ a = extra_one_hot.view(extra_num_seq, num_res * 23) + b = (sample_one_hot * weights).view(num_seq, num_res * 23).transpose(0, 1) + agreement = a @ b + # Assign each sequence in the extra sequences to the closest MSA sample + protein['extra_cluster_assignment'] = torch.argmax(agreement, dim=1).long() + + return protein + + +def unsorted_segment_sum(data, segment_ids, num_segments): + assert len( + segment_ids.shape) == 1 and segment_ids.shape[0] == data.shape[0] + segment_ids = segment_ids.view(segment_ids.shape[0], + *((1, ) * len(data.shape[1:]))) + segment_ids = segment_ids.expand(data.shape) + shape = [num_segments] + list(data.shape[1:]) + tensor = torch.zeros(*shape).scatter_add_(0, segment_ids, data.float()) + tensor = tensor.type(data.dtype) + return tensor + + +def summarize_clusters(protein): + """Produce profile and deletion_matrix_mean within each cluster.""" + num_seq = protein['msa'].shape[0] + + def csum(x): + return unsorted_segment_sum(x, protein['extra_cluster_assignment'], + num_seq) + + mask = protein['extra_msa_mask'] + mask_counts = 1e-6 + protein['msa_mask'] + csum(mask) # Include center + + # TODO: this line is very slow + msa_sum = csum(mask[:, :, None] * one_hot(protein['extra_msa'], 23)) + msa_sum += one_hot(protein['msa'], 23) # Original sequence + protein['cluster_profile'] = msa_sum / mask_counts[:, :, None] + del msa_sum + + del_sum = csum(mask * protein['extra_deletion_matrix']) + del_sum += protein['deletion_matrix'] # Original sequence + protein['cluster_deletion_mean'] = del_sum / mask_counts + del del_sum + + return protein + + +@curry1 +def nearest_neighbor_clusters_v2(batch, gap_agreement_weight=0.0): + """Assign each extra MSA sequence to its nearest neighbor in sampled MSA.""" + + # Determine how much weight we assign to each agreement. In theory, we could + # use a full blosum matrix here, but right now let's just down-weight gap + # agreement because it could be spurious. + # Never put weight on agreeing on BERT mask. + + weights = torch.tensor( + [1.0] * 21 + [gap_agreement_weight] + [0.0], dtype=torch.float32) + + msa_mask = batch['msa_mask'] + extra_mask = batch['extra_msa_mask'] + msa_one_hot = one_hot(batch['msa'], 23) + extra_one_hot = one_hot(batch['extra_msa'], 23) + + msa_one_hot_masked = msa_mask[:, :, None] * msa_one_hot + extra_one_hot_masked = extra_mask[:, :, None] * extra_one_hot + + t1 = weights * msa_one_hot_masked + t1 = t1.view(t1.shape[0], t1.shape[1] * t1.shape[2]) + t2 = extra_one_hot_masked.view( + extra_one_hot.shape[0], + extra_one_hot.shape[1] * extra_one_hot.shape[2]) + agreement = t1 @ t2.T + + cluster_assignment = torch.nn.functional.softmax(1e3 * agreement, dim=0) + cluster_assignment *= torch.einsum('mr, nr->mn', msa_mask, extra_mask) + + cluster_count = torch.sum(cluster_assignment, dim=-1) + cluster_count += 1.0 # We always include the sequence itself. + + msa_sum = torch.einsum('nm, mrc->nrc', cluster_assignment, + extra_one_hot_masked) + msa_sum += msa_one_hot_masked + + cluster_profile = msa_sum / cluster_count[:, None, None] + + deletion_matrix = batch['deletion_matrix'] + extra_deletion_matrix = batch['extra_deletion_matrix'] + + del_sum = torch.einsum('nm, mc->nc', cluster_assignment, + extra_mask * extra_deletion_matrix) + del_sum += deletion_matrix # Original sequence. 
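+ # Normalise the summed deletions by the soft cluster sizes to obtain the
+ # per-cluster mean deletion counts.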
+ cluster_deletion_mean = del_sum / cluster_count[:, None] + batch['cluster_profile'] = cluster_profile + batch['cluster_deletion_mean'] = cluster_deletion_mean + + return batch + + +def make_msa_mask(protein): + """Mask features are all ones, but will later be zero-padded.""" + if 'msa_mask' not in protein: + protein['msa_mask'] = torch.ones( + protein['msa'].shape, dtype=torch.float32) + protein['msa_row_mask'] = torch.ones((protein['msa'].shape[0]), + dtype=torch.float32) + return protein + + +def pseudo_beta_fn(aatype, all_atom_positions, all_atom_mask): + """Create pseudo beta features.""" + if aatype.shape[0] > 0: + is_gly = torch.eq(aatype, rc.restype_order['G']) + ca_idx = rc.atom_order['CA'] + cb_idx = rc.atom_order['CB'] + pseudo_beta = torch.where( + torch.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3]), + all_atom_positions[..., ca_idx, :], + all_atom_positions[..., cb_idx, :], + ) + else: + pseudo_beta = all_atom_positions.new_zeros(*aatype.shape, 3) + if all_atom_mask is not None: + if aatype.shape[0] > 0: + pseudo_beta_mask = torch.where(is_gly, all_atom_mask[..., ca_idx], + all_atom_mask[..., cb_idx]) + else: + pseudo_beta_mask = torch.zeros_like(aatype).float() + return pseudo_beta, pseudo_beta_mask + else: + return pseudo_beta + + +@curry1 +def make_pseudo_beta(protein, prefix=''): + """Create pseudo-beta (alpha for glycine) position and mask.""" + assert prefix in ['', 'template_'] + ( + protein[prefix + 'pseudo_beta'], + protein[prefix + 'pseudo_beta_mask'], + ) = pseudo_beta_fn( + protein['template_aatype' if prefix else 'aatype'], + protein[prefix + 'all_atom_positions'], + protein['template_all_atom_mask' if prefix else 'all_atom_mask'], + ) + return protein + + +@curry1 +def add_constant_field(protein, key, value): + protein[key] = torch.tensor(value) + return protein + + +def shaped_categorical(probs, epsilon=1e-10): + ds = probs.shape + num_classes = ds[-1] + probs = torch.reshape(probs + epsilon, [-1, num_classes]) + gen = torch.Generator() + gen.manual_seed(np.random.randint(65535)) + counts = torch.multinomial(probs, 1, generator=gen) + return torch.reshape(counts, ds[:-1]) + + +def make_hhblits_profile(protein): + """Compute the HHblits MSA profile if not already present.""" + if 'hhblits_profile' in protein: + return protein + + # Compute the profile for every residue (over all MSA sequences). + msa_one_hot = one_hot(protein['msa'], 22) + + protein['hhblits_profile'] = torch.mean(msa_one_hot, dim=0) + return protein + + +def make_msa_profile(batch): + """Compute the MSA profile.""" + # Compute the profile for every residue (over all MSA sequences). 
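+ # Masked mean over MSA rows: a 22-way one-hot (20 amino acids + X + gap),
+ # weighted by msa_mask so padded rows do not contribute to the profile.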
+ oh = one_hot(batch['msa'], 22) + mask = batch['msa_mask'][:, :, None] + oh *= mask + return oh.sum(dim=0) / (mask.sum(dim=0) + 1e-10) + + +def make_hhblits_profile_v2(protein): + """Compute the HHblits MSA profile if not already present.""" + if 'hhblits_profile' in protein: + return protein + protein['hhblits_profile'] = make_msa_profile(protein) + return protein + + +def share_mask_by_entity(mask_position, protein): # new in unifold + if 'num_sym' not in protein: + return mask_position + entity_id = protein['entity_id'] + sym_id = protein['sym_id'] + num_sym = protein['num_sym'] + unique_entity_ids = entity_id.unique() + first_sym_mask = sym_id == 1 + for cur_entity_id in unique_entity_ids: + cur_entity_mask = entity_id == cur_entity_id + cur_num_sym = int(num_sym[cur_entity_mask][0]) + if cur_num_sym > 1: + cur_sym_mask = first_sym_mask & cur_entity_mask + cur_sym_bert_mask = mask_position[:, cur_sym_mask] + mask_position[:, cur_entity_mask] = cur_sym_bert_mask.repeat( + 1, cur_num_sym) + return mask_position + + +@curry1 +def make_masked_msa(protein, + config, + replace_fraction, + gumbel_sample=False, + share_mask=False): + """Create data for BERT on raw MSA.""" + # Add a random amino acid uniformly. + random_aa = torch.tensor([0.05] * 20 + [0.0, 0.0], dtype=torch.float32) + + categorical_probs = ( + config.uniform_prob * random_aa + + config.profile_prob * protein['hhblits_profile'] + + config.same_prob * one_hot(protein['msa'], 22)) + + # Put all remaining probability on [MASK] which is a new column + pad_shapes = list( + reduce(add, [(0, 0) for _ in range(len(categorical_probs.shape))])) + pad_shapes[1] = 1 + mask_prob = 1.0 - config.profile_prob - config.same_prob - config.uniform_prob + assert mask_prob >= 0.0 + categorical_probs = torch.nn.functional.pad( + categorical_probs, pad_shapes, value=mask_prob) + sh = protein['msa'].shape + mask_position = torch.from_numpy(np.random.rand(*sh) < replace_fraction) + mask_position &= protein['msa_mask'].bool() + + if 'bert_mask' in protein: + mask_position &= protein['bert_mask'].bool() + + if share_mask: + mask_position = share_mask_by_entity(mask_position, protein) + if gumbel_sample: + logits = torch.log(categorical_probs + 1e-6) + bert_msa = gumbel_max_sample(logits) + else: + bert_msa = shaped_categorical(categorical_probs) + bert_msa = torch.where(mask_position, bert_msa, protein['msa']) + bert_msa *= protein['msa_mask'].long() + + # Mix real and masked MSA + protein['bert_mask'] = mask_position.to(torch.float32) + protein['true_msa'] = protein['msa'] + protein['msa'] = bert_msa + + return protein + + +@curry1 +def make_fixed_size( + protein, + shape_schema, + msa_cluster_size, + extra_msa_size, + num_res=0, + num_templates=0, +): + """Guess at the MSA and sequence dimension to make fixed size.""" + + def get_pad_size(cur_size, multiplier=4): + return max(multiplier, + ((cur_size + multiplier - 1) // multiplier) * multiplier) + + if num_res is not None: + input_num_res = ( + protein['aatype'].shape[0] + if 'aatype' in protein else protein['msa_mask'].shape[1]) + if input_num_res != num_res: + num_res = get_pad_size(input_num_res, 4) + if 'extra_msa_mask' in protein: + input_extra_msa_size = protein['extra_msa_mask'].shape[0] + if input_extra_msa_size != extra_msa_size: + extra_msa_size = get_pad_size(input_extra_msa_size, 8) + pad_size_map = { + N_RES: num_res, + N_MSA: msa_cluster_size, + N_EXTRA_MSA: extra_msa_size, + N_TPL: num_templates, + } + + for k, v in protein.items(): + # Don't transfer this to the accelerator. 
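+ # Every other feature is padded up to the sizes in pad_size_map according to
+ # its shape schema, so all examples in a batch end up with fixed shapes.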
+ if k == 'extra_cluster_assignment': + continue + shape = list(v.shape) + schema = shape_schema[k] + msg = 'Rank mismatch between shape and shape schema for' + assert len(shape) == len(schema), f'{msg} {k}: {shape} vs {schema}' + pad_size = [ + pad_size_map.get(s2, None) or s1 + for (s1, s2) in zip(shape, schema) + ] + + padding = [(0, p - v.shape[i]) for i, p in enumerate(pad_size)] + padding.reverse() + padding = list(itertools.chain(*padding)) + if padding: + protein[k] = torch.nn.functional.pad(v, padding) + protein[k] = torch.reshape(protein[k], pad_size) + + return protein + + +def make_target_feat(protein): + """Create and concatenate MSA features.""" + protein['aatype'] = protein['aatype'].long() + + if 'between_segment_residues' in protein: + has_break = torch.clip( + protein['between_segment_residues'].to(torch.float32), 0, 1) + else: + has_break = torch.zeros_like(protein['aatype'], dtype=torch.float32) + if 'asym_len' in protein: + asym_len = protein['asym_len'] + entity_ends = torch.cumsum(asym_len, dim=-1)[:-1] + has_break[entity_ends] = 1.0 + has_break = has_break.float() + aatype_1hot = one_hot(protein['aatype'], 21) + target_feat = [ + torch.unsqueeze(has_break, dim=-1), + aatype_1hot, # Everyone gets the original sequence. + ] + protein['target_feat'] = torch.cat(target_feat, dim=-1) + return protein + + +def make_msa_feat(protein): + """Create and concatenate MSA features.""" + msa_1hot = one_hot(protein['msa'], 23) + has_deletion = torch.clip(protein['deletion_matrix'], 0.0, 1.0) + deletion_value = torch.atan( + protein['deletion_matrix'] / 3.0) * (2.0 / np.pi) + msa_feat = [ + msa_1hot, + torch.unsqueeze(has_deletion, dim=-1), + torch.unsqueeze(deletion_value, dim=-1), + ] + if 'cluster_profile' in protein: + deletion_mean_value = torch.atan( + protein['cluster_deletion_mean'] / 3.0) * (2.0 / np.pi) + msa_feat.extend([ + protein['cluster_profile'], + torch.unsqueeze(deletion_mean_value, dim=-1), + ]) + + if 'extra_deletion_matrix' in protein: + protein['extra_msa_has_deletion'] = torch.clip( + protein['extra_deletion_matrix'], 0.0, 1.0) + protein['extra_msa_deletion_value'] = torch.atan( + protein['extra_deletion_matrix'] / 3.0) * (2.0 / np.pi) + + protein['msa_feat'] = torch.cat(msa_feat, dim=-1) + return protein + + +def make_msa_feat_v2(batch): + """Create and concatenate MSA features.""" + msa_1hot = one_hot(batch['msa'], 23) + deletion_matrix = batch['deletion_matrix'] + has_deletion = torch.clip(deletion_matrix, 0.0, 1.0)[..., None] + deletion_value = (torch.atan(deletion_matrix / 3.0) * (2.0 / np.pi))[..., + None] + + deletion_mean_value = ( + torch.arctan(batch['cluster_deletion_mean'] / 3.0) * # noqa W504 + (2.0 / np.pi))[..., None] + + msa_feat = [ + msa_1hot, + has_deletion, + deletion_value, + batch['cluster_profile'], + deletion_mean_value, + ] + batch['msa_feat'] = torch.concat(msa_feat, dim=-1) + return batch + + +@curry1 +def make_extra_msa_feat(batch, num_extra_msa): + # 23 = 20 amino acids + 'X' for unknown + gap + bert mask + extra_msa = batch['extra_msa'][:num_extra_msa] + deletion_matrix = batch['extra_deletion_matrix'][:num_extra_msa] + has_deletion = torch.clip(deletion_matrix, 0.0, 1.0) + deletion_value = torch.atan(deletion_matrix / 3.0) * (2.0 / np.pi) + extra_msa_mask = batch['extra_msa_mask'][:num_extra_msa] + batch['extra_msa'] = extra_msa + batch['extra_msa_mask'] = extra_msa_mask + batch['extra_msa_has_deletion'] = has_deletion + batch['extra_msa_deletion_value'] = deletion_value + return batch + + +@curry1 +def select_feat(protein, 
feature_list): + return {k: v for k, v in protein.items() if k in feature_list} + + +def make_atom14_masks(protein): + """Construct denser atom positions (14 dimensions instead of 37).""" + + if 'atom14_atom_exists' in protein: # lazy move + return protein + + restype_atom14_to_atom37 = torch.tensor( + rc.restype_atom14_to_atom37, + dtype=torch.int64, + device=protein['aatype'].device, + ) + restype_atom37_to_atom14 = torch.tensor( + rc.restype_atom37_to_atom14, + dtype=torch.int64, + device=protein['aatype'].device, + ) + restype_atom14_mask = torch.tensor( + rc.restype_atom14_mask, + dtype=torch.float32, + device=protein['aatype'].device, + ) + restype_atom37_mask = torch.tensor( + rc.restype_atom37_mask, + dtype=torch.float32, + device=protein['aatype'].device) + + protein_aatype = protein['aatype'].long() + protein['residx_atom14_to_atom37'] = restype_atom14_to_atom37[ + protein_aatype].long() + protein['residx_atom37_to_atom14'] = restype_atom37_to_atom14[ + protein_aatype].long() + protein['atom14_atom_exists'] = restype_atom14_mask[protein_aatype] + protein['atom37_atom_exists'] = restype_atom37_mask[protein_aatype] + + return protein + + +def make_atom14_masks_np(batch): + batch = tree_map(lambda n: torch.tensor(n), batch, np.ndarray) + out = make_atom14_masks(batch) + out = tensor_tree_map(lambda t: np.array(t), out) + return out + + +def make_atom14_positions(protein): + """Constructs denser atom positions (14 dimensions instead of 37).""" + protein['aatype'] = protein['aatype'].long() + protein['all_atom_mask'] = protein['all_atom_mask'].float() + protein['all_atom_positions'] = protein['all_atom_positions'].float() + residx_atom14_mask = protein['atom14_atom_exists'] + residx_atom14_to_atom37 = protein['residx_atom14_to_atom37'] + + # Create a mask for known ground truth positions. + residx_atom14_gt_mask = residx_atom14_mask * batched_gather( + protein['all_atom_mask'], + residx_atom14_to_atom37, + dim=-1, + num_batch_dims=len(protein['all_atom_mask'].shape[:-1]), + ) + + # Gather the ground truth positions. + residx_atom14_gt_positions = residx_atom14_gt_mask[..., None] * ( + batched_gather( + protein['all_atom_positions'], + residx_atom14_to_atom37, + dim=-2, + num_batch_dims=len(protein['all_atom_positions'].shape[:-2]), + )) + + protein['atom14_atom_exists'] = residx_atom14_mask + protein['atom14_gt_exists'] = residx_atom14_gt_mask + protein['atom14_gt_positions'] = residx_atom14_gt_positions + + renaming_matrices = torch.tensor( + rc.renaming_matrices, + dtype=protein['all_atom_mask'].dtype, + device=protein['all_atom_mask'].device, + ) + + # Pick the transformation matrices for the given residue sequence + # shape (num_res, 14, 14). + renaming_transform = renaming_matrices[protein['aatype']] + + # Apply it to the ground truth positions. shape (num_res, 14, 3). + alternative_gt_positions = torch.einsum('...rac,...rab->...rbc', + residx_atom14_gt_positions, + renaming_transform) + protein['atom14_alt_gt_positions'] = alternative_gt_positions + + # Create the mask for the alternative ground truth (differs from the + # ground truth mask, if only one of the atoms in an ambiguous pair has a + # ground truth position). 
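+ # The same per-residue renaming (permutation) matrices swap the mask entries
+ # of ambiguous atom pairs.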
+ alternative_gt_mask = torch.einsum('...ra,...rab->...rb', + residx_atom14_gt_mask, + renaming_transform) + protein['atom14_alt_gt_exists'] = alternative_gt_mask + + restype_atom14_is_ambiguous = torch.tensor( + rc.restype_atom14_is_ambiguous, + dtype=protein['all_atom_mask'].dtype, + device=protein['all_atom_mask'].device, + ) + # From this create an ambiguous_mask for the given sequence. + protein['atom14_atom_is_ambiguous'] = restype_atom14_is_ambiguous[ + protein['aatype']] + + return protein + + +def atom37_to_frames(protein, eps=1e-8): + # TODO: extract common part and put them into residue constants. + aatype = protein['aatype'] + all_atom_positions = protein['all_atom_positions'] + all_atom_mask = protein['all_atom_mask'] + + batch_dims = len(aatype.shape[:-1]) + + restype_rigidgroup_base_atom_names = np.full([21, 8, 3], '', dtype=object) + restype_rigidgroup_base_atom_names[:, 0, :] = ['C', 'CA', 'N'] + restype_rigidgroup_base_atom_names[:, 3, :] = ['CA', 'C', 'O'] + + for restype, restype_letter in enumerate(rc.restypes): + resname = rc.restype_1to3[restype_letter] + for chi_idx in range(4): + if rc.chi_angles_mask[restype][chi_idx]: + names = rc.chi_angles_atoms[resname][chi_idx] + restype_rigidgroup_base_atom_names[restype, + chi_idx + 4, :] = names[1:] + + restype_rigidgroup_mask = all_atom_mask.new_zeros( + (*aatype.shape[:-1], 21, 8), ) + restype_rigidgroup_mask[..., 0] = 1 + restype_rigidgroup_mask[..., 3] = 1 + restype_rigidgroup_mask[..., :20, + 4:] = all_atom_mask.new_tensor(rc.chi_angles_mask) + + lookuptable = rc.atom_order.copy() + lookuptable[''] = 0 + lookup = np.vectorize(lambda x: lookuptable[x]) + restype_rigidgroup_base_atom37_idx = lookup( + restype_rigidgroup_base_atom_names, ) + restype_rigidgroup_base_atom37_idx = aatype.new_tensor( + restype_rigidgroup_base_atom37_idx, ) + restype_rigidgroup_base_atom37_idx = restype_rigidgroup_base_atom37_idx.view( + *((1, ) * batch_dims), *restype_rigidgroup_base_atom37_idx.shape) + + residx_rigidgroup_base_atom37_idx = batched_gather( + restype_rigidgroup_base_atom37_idx, + aatype, + dim=-3, + num_batch_dims=batch_dims, + ) + + base_atom_pos = batched_gather( + all_atom_positions, + residx_rigidgroup_base_atom37_idx, + dim=-2, + num_batch_dims=len(all_atom_positions.shape[:-2]), + ) + + gt_frames = Frame.from_3_points( + p_neg_x_axis=base_atom_pos[..., 0, :], + origin=base_atom_pos[..., 1, :], + p_xy_plane=base_atom_pos[..., 2, :], + eps=eps, + ) + + group_exists = batched_gather( + restype_rigidgroup_mask, + aatype, + dim=-2, + num_batch_dims=batch_dims, + ) + + gt_atoms_exist = batched_gather( + all_atom_mask, + residx_rigidgroup_base_atom37_idx, + dim=-1, + num_batch_dims=len(all_atom_mask.shape[:-1]), + ) + gt_exists = torch.min(gt_atoms_exist, dim=-1)[0] * group_exists + + rots = torch.eye(3, dtype=all_atom_mask.dtype, device=aatype.device) + rots = torch.tile(rots, (*((1, ) * batch_dims), 8, 1, 1)) + rots[..., 0, 0, 0] = -1 + rots[..., 0, 2, 2] = -1 + rots = Rotation(mat=rots) + + gt_frames = gt_frames.compose(Frame(rots, None)) + + restype_rigidgroup_is_ambiguous = all_atom_mask.new_zeros( + *((1, ) * batch_dims), 21, 8) + restype_rigidgroup_rots = torch.eye( + 3, dtype=all_atom_mask.dtype, device=aatype.device) + restype_rigidgroup_rots = torch.tile( + restype_rigidgroup_rots, + (*((1, ) * batch_dims), 21, 8, 1, 1), + ) + + for resname, _ in rc.residue_atom_renaming_swaps.items(): + restype = rc.restype_order[rc.restype_3to1[resname]] + chi_idx = int(sum(rc.chi_angles_mask[restype]) - 1) + 
restype_rigidgroup_is_ambiguous[..., restype, chi_idx + 4] = 1 + restype_rigidgroup_rots[..., restype, chi_idx + 4, 1, 1] = -1 + restype_rigidgroup_rots[..., restype, chi_idx + 4, 2, 2] = -1 + + residx_rigidgroup_is_ambiguous = batched_gather( + restype_rigidgroup_is_ambiguous, + aatype, + dim=-2, + num_batch_dims=batch_dims, + ) + + residx_rigidgroup_ambiguity_rot = batched_gather( + restype_rigidgroup_rots, + aatype, + dim=-4, + num_batch_dims=batch_dims, + ) + + residx_rigidgroup_ambiguity_rot = Rotation( + mat=residx_rigidgroup_ambiguity_rot) + alt_gt_frames = gt_frames.compose( + Frame(residx_rigidgroup_ambiguity_rot, None)) + + gt_frames_tensor = gt_frames.to_tensor_4x4() + alt_gt_frames_tensor = alt_gt_frames.to_tensor_4x4() + + protein['rigidgroups_gt_frames'] = gt_frames_tensor + protein['rigidgroups_gt_exists'] = gt_exists + protein['rigidgroups_group_exists'] = group_exists + protein['rigidgroups_group_is_ambiguous'] = residx_rigidgroup_is_ambiguous + protein['rigidgroups_alt_gt_frames'] = alt_gt_frames_tensor + + return protein + + +@curry1 +def atom37_to_torsion_angles( + protein, + prefix='', +): + aatype = protein[prefix + 'aatype'] + all_atom_positions = protein[prefix + 'all_atom_positions'] + all_atom_mask = protein[prefix + 'all_atom_mask'] + if aatype.shape[-1] == 0: + base_shape = aatype.shape + protein[prefix + + 'torsion_angles_sin_cos'] = all_atom_positions.new_zeros( + *base_shape, 7, 2) + protein[prefix + + 'alt_torsion_angles_sin_cos'] = all_atom_positions.new_zeros( + *base_shape, 7, 2) + protein[prefix + 'torsion_angles_mask'] = all_atom_positions.new_zeros( + *base_shape, 7) + return protein + + aatype = torch.clamp(aatype, max=20) + + pad = all_atom_positions.new_zeros( + [*all_atom_positions.shape[:-3], 1, 37, 3]) + prev_all_atom_positions = torch.cat( + [pad, all_atom_positions[..., :-1, :, :]], dim=-3) + + pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37]) + prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2) + + pre_omega_atom_pos = torch.cat( + [prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]], + dim=-2, + ) + phi_atom_pos = torch.cat( + [prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]], + dim=-2, + ) + psi_atom_pos = torch.cat( + [all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]], + dim=-2, + ) + + pre_omega_mask = torch.prod( + prev_all_atom_mask[..., 1:3], dim=-1) * torch.prod( + all_atom_mask[..., :2], dim=-1) + phi_mask = prev_all_atom_mask[..., 2] * torch.prod( + all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype) + psi_mask = ( + torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype) + * all_atom_mask[..., 4]) + + chi_atom_indices = torch.as_tensor( + rc.chi_atom_indices, device=aatype.device) + + atom_indices = chi_atom_indices[..., aatype, :, :] + chis_atom_pos = batched_gather(all_atom_positions, atom_indices, -2, + len(atom_indices.shape[:-2])) + + chi_angles_mask = list(rc.chi_angles_mask) + chi_angles_mask.append([0.0, 0.0, 0.0, 0.0]) + chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask) + + chis_mask = chi_angles_mask[aatype, :] + + chi_angle_atoms_mask = batched_gather( + all_atom_mask, + atom_indices, + dim=-1, + num_batch_dims=len(atom_indices.shape[:-2]), + ) + chi_angle_atoms_mask = torch.prod( + chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype) + chis_mask = chis_mask * chi_angle_atoms_mask + + torsions_atom_pos = torch.cat( + [ + pre_omega_atom_pos[..., None, :, :], + phi_atom_pos[..., None, :, :], 
+ psi_atom_pos[..., None, :, :], + chis_atom_pos, + ], + dim=-3, + ) + + torsion_angles_mask = torch.cat( + [ + pre_omega_mask[..., None], + phi_mask[..., None], + psi_mask[..., None], + chis_mask, + ], + dim=-1, + ) + + torsion_frames = Frame.from_3_points( + torsions_atom_pos[..., 1, :], + torsions_atom_pos[..., 2, :], + torsions_atom_pos[..., 0, :], + eps=1e-8, + ) + + fourth_atom_rel_pos = torsion_frames.invert().apply( + torsions_atom_pos[..., 3, :]) + + torsion_angles_sin_cos = torch.stack( + [fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1) + + denom = torch.sqrt( + torch.sum( + torch.square(torsion_angles_sin_cos), + dim=-1, + dtype=torsion_angles_sin_cos.dtype, + keepdims=True, + ) + 1e-8) + torsion_angles_sin_cos = torsion_angles_sin_cos / denom + + torsion_angles_sin_cos = ( + torsion_angles_sin_cos + * all_atom_mask.new_tensor([1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0], )[ + ((None, ) * len(torsion_angles_sin_cos.shape[:-2])) + + (slice(None), None)]) + + chi_is_ambiguous = torsion_angles_sin_cos.new_tensor( + rc.chi_pi_periodic, )[aatype, ...] + + mirror_torsion_angles = torch.cat( + [ + all_atom_mask.new_ones(*aatype.shape, 3), + 1.0 - 2.0 * chi_is_ambiguous, + ], + dim=-1, + ) + + alt_torsion_angles_sin_cos = ( + torsion_angles_sin_cos * mirror_torsion_angles[..., None]) + + if prefix == '': + # consistent to uni-fold. use [1, 0] placeholder + placeholder_torsions = torch.stack( + [ + torch.ones(torsion_angles_sin_cos.shape[:-1]), + torch.zeros(torsion_angles_sin_cos.shape[:-1]), + ], + dim=-1, + ) + torsion_angles_sin_cos = torsion_angles_sin_cos * torsion_angles_mask[ + ..., + None] + placeholder_torsions * (1 - torsion_angles_mask[..., None]) + alt_torsion_angles_sin_cos = alt_torsion_angles_sin_cos * torsion_angles_mask[ + ..., + None] + placeholder_torsions * (1 - torsion_angles_mask[..., None]) + + protein[prefix + 'torsion_angles_sin_cos'] = torsion_angles_sin_cos + protein[prefix + 'alt_torsion_angles_sin_cos'] = alt_torsion_angles_sin_cos + protein[prefix + 'torsion_angles_mask'] = torsion_angles_mask + + return protein + + +def get_backbone_frames(protein): + protein['true_frame_tensor'] = protein['rigidgroups_gt_frames'][..., + 0, :, :] + protein['frame_mask'] = protein['rigidgroups_gt_exists'][..., 0] + + return protein + + +def get_chi_angles(protein): + dtype = protein['all_atom_mask'].dtype + protein['chi_angles_sin_cos'] = ( + protein['torsion_angles_sin_cos'][..., 3:, :]).to(dtype) + protein['chi_mask'] = protein['torsion_angles_mask'][..., 3:].to(dtype) + + return protein + + +@curry1 +def crop_templates( + protein, + max_templates, + subsample_templates=False, +): + if 'template_mask' in protein: + num_templates = protein['template_mask'].shape[-1] + else: + num_templates = 0 + + # don't sample when there are no templates + if num_templates > 0: + if subsample_templates: + # af2's sampling, min(4, uniform[0, n]) + max_templates = min(max_templates, + np.random.randint(0, num_templates + 1)) + template_idx = torch.tensor( + np.random.choice(num_templates, max_templates, replace=False), + dtype=torch.int64, + ) + else: + # use top templates + template_idx = torch.arange( + min(num_templates, max_templates), dtype=torch.int64) + for k, v in protein.items(): + if k.startswith('template'): + try: + v = v[template_idx] + except Exception as ex: + print(ex.__class__, ex) + print('num_templates', num_templates) + print(k, v.shape) + print('protein:', protein) + print( + 'protein_shape:', + { + k: v.shape + for k, v in protein.items() if 'shape' in 
dir(v) + }, + ) + protein[k] = v + + return protein + + +@curry1 +def crop_to_size_single(protein, crop_size, shape_schema, seed): + """crop to size.""" + num_res = ( + protein['aatype'].shape[0] + if 'aatype' in protein else protein['msa_mask'].shape[1]) + crop_idx = get_single_crop_idx(num_res, crop_size, seed) + protein = apply_crop_idx(protein, shape_schema, crop_idx) + return protein + + +@curry1 +def crop_to_size_multimer(protein, crop_size, shape_schema, seed, + spatial_crop_prob, ca_ca_threshold): + """crop to size.""" + with data_utils.numpy_seed(seed, key='multimer_crop'): + use_spatial_crop = np.random.rand() < spatial_crop_prob + is_distillation = 'is_distillation' in protein and protein[ + 'is_distillation'] == 1 + if is_distillation: + return crop_to_size_single( + crop_size=crop_size, shape_schema=shape_schema, seed=seed)( + protein) + elif use_spatial_crop: + crop_idx = get_spatial_crop_idx(protein, crop_size, seed, + ca_ca_threshold) + else: + crop_idx = get_contiguous_crop_idx(protein, crop_size, seed) + return apply_crop_idx(protein, shape_schema, crop_idx) + + +def get_single_crop_idx(num_res: NumpyDict, crop_size: int, + random_seed: Optional[int]) -> torch.Tensor: + + if num_res < crop_size: + return torch.arange(num_res) + with data_utils.numpy_seed(random_seed): + crop_start = int(np.random.randint(0, num_res - crop_size + 1)) + return torch.arange(crop_start, crop_start + crop_size) + + +def get_crop_sizes_each_chain( + asym_len: torch.Tensor, + crop_size: int, + random_seed: Optional[int] = None, + use_multinomial: bool = False, +) -> torch.Tensor: + """get crop sizes for contiguous crop""" + if not use_multinomial: + with data_utils.numpy_seed( + random_seed, key='multimer_contiguous_perm'): + shuffle_idx = np.random.permutation(len(asym_len)) + num_left = asym_len.sum() + num_budget = torch.tensor(crop_size) + crop_sizes = [0 for _ in asym_len] + for j, idx in enumerate(shuffle_idx): + this_len = asym_len[idx] + num_left -= this_len + # num res at most we can keep in this ent + max_size = min(num_budget, this_len) + # num res at least we shall keep in this ent + min_size = min(this_len, max(0, num_budget - num_left)) + with data_utils.numpy_seed( + random_seed, j, key='multimer_contiguous_crop_size'): + this_crop_size = int( + np.random.randint( + low=int(min_size), high=int(max_size) + 1)) + num_budget -= this_crop_size + crop_sizes[idx] = this_crop_size + crop_sizes = torch.tensor(crop_sizes) + else: # use multinomial + # TODO: better multimer + entity_probs = asym_len / torch.sum(asym_len) + crop_sizes = torch.from_numpy( + np.random.multinomial(crop_size, pvals=entity_probs)) + crop_sizes = torch.min(crop_sizes, asym_len) + return crop_sizes + + +def get_contiguous_crop_idx( + protein: NumpyDict, + crop_size: int, + random_seed: Optional[int] = None, + use_multinomial: bool = False, +) -> torch.Tensor: + + num_res = protein['aatype'].shape[0] + if num_res <= crop_size: + return torch.arange(num_res) + + assert 'asym_len' in protein + asym_len = protein['asym_len'] + + crop_sizes = get_crop_sizes_each_chain(asym_len, crop_size, random_seed, + use_multinomial) + crop_idxs = [] + asym_offset = torch.tensor(0, dtype=torch.int64) + with data_utils.numpy_seed( + random_seed, key='multimer_contiguous_crop_start_idx'): + for ll, csz in zip(asym_len, crop_sizes): + this_start = np.random.randint(0, int(ll - csz) + 1) + crop_idxs.append( + torch.arange(asym_offset + this_start, + asym_offset + this_start + csz)) + asym_offset += ll + + return torch.concat(crop_idxs) + 
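+# Spatial cropping, as in AlphaFold-Multimer: pick a random interface residue
+# (a CA atom within ca_ca_threshold of a CA from a different chain) and keep
+# the crop_size residues whose CA atoms are closest to it. A small index-based
+# term breaks ties between equidistant residues before the argsort.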
+ +def get_spatial_crop_idx( + protein: NumpyDict, + crop_size: int, + random_seed: int, + ca_ca_threshold: float, + inf: float = 3e4, +) -> List[int]: + + ca_idx = rc.atom_order['CA'] + ca_coords = protein['all_atom_positions'][..., ca_idx, :] + ca_mask = protein['all_atom_mask'][..., ca_idx].bool() + # if there are not enough atoms to construct interface, use contiguous crop + if (ca_mask.sum(dim=-1) <= 1).all(): + return get_contiguous_crop_idx(protein, crop_size, random_seed) + + pair_mask = ca_mask[..., None] * ca_mask[..., None, :] + ca_distances = get_pairwise_distances(ca_coords) + + interface_candidates = get_interface_candidates(ca_distances, + protein['asym_id'], + pair_mask, ca_ca_threshold) + + if torch.any(interface_candidates): + with data_utils.numpy_seed(random_seed, key='multimer_spatial_crop'): + target_res = int(np.random.choice(interface_candidates)) + else: + return get_contiguous_crop_idx(protein, crop_size, random_seed) + + to_target_distances = ca_distances[target_res] + # set inf to non-position residues + to_target_distances[~ca_mask] = inf + break_tie = ( + torch.arange( + 0, + to_target_distances.shape[-1], + device=to_target_distances.device).float() * 1e-3) + to_target_distances += break_tie + ret = torch.argsort(to_target_distances)[:crop_size] + return ret.sort().values + + +def get_pairwise_distances(coords: torch.Tensor) -> torch.Tensor: + coord_diff = coords.unsqueeze(-2) - coords.unsqueeze(-3) + return torch.sqrt(torch.sum(coord_diff**2, dim=-1)) + + +def get_interface_candidates( + ca_distances: torch.Tensor, + asym_id: torch.Tensor, + pair_mask: torch.Tensor, + ca_ca_threshold, +) -> torch.Tensor: + + in_same_asym = asym_id[..., None] == asym_id[..., None, :] + # set distance in the same entity to zero + ca_distances = ca_distances * (1.0 - in_same_asym.float()) * pair_mask + cnt_interfaces = torch.sum( + (ca_distances > 0) & (ca_distances < ca_ca_threshold), dim=-1) + interface_candidates = cnt_interfaces.nonzero(as_tuple=True)[0] + return interface_candidates + + +def apply_crop_idx(protein, shape_schema, crop_idx): + cropped_protein = {} + for k, v in protein.items(): + if k not in shape_schema: # skip items with unknown shape schema + continue + for i, dim_size in enumerate(shape_schema[k]): + if dim_size == N_RES: + v = torch.index_select(v, i, crop_idx) + cropped_protein[k] = v + return cropped_protein diff --git a/modelscope/models/science/unifold/data/msa_pairing.py b/modelscope/models/science/unifold/data/msa_pairing.py new file mode 100644 index 00000000..cc65962c --- /dev/null +++ b/modelscope/models/science/unifold/data/msa_pairing.py @@ -0,0 +1,526 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Pairing logic for multimer data """ + +import collections +from typing import Dict, Iterable, List, Sequence + +import numpy as np +import pandas as pd +import scipy.linalg + +from .data_ops import NumpyDict +from .residue_constants import restypes_with_x_and_gap + +MSA_GAP_IDX = restypes_with_x_and_gap.index('-') +SEQUENCE_GAP_CUTOFF = 0.5 +SEQUENCE_SIMILARITY_CUTOFF = 0.9 + +MSA_PAD_VALUES = { + 'msa_all_seq': MSA_GAP_IDX, + 'msa_mask_all_seq': 1, + 'deletion_matrix_all_seq': 0, + 'deletion_matrix_int_all_seq': 0, + 'msa': MSA_GAP_IDX, + 'msa_mask': 1, + 'deletion_matrix': 0, + 'deletion_matrix_int': 0, +} + +MSA_FEATURES = ('msa', 'msa_mask', 'deletion_matrix', 'deletion_matrix_int') +SEQ_FEATURES = ( + 'residue_index', + 'aatype', + 'all_atom_positions', + 'all_atom_mask', + 'seq_mask', + 'between_segment_residues', + 'has_alt_locations', + 'has_hetatoms', + 'asym_id', + 'entity_id', + 'sym_id', + 'entity_mask', + 'deletion_mean', + 'prediction_atom_mask', + 'literature_positions', + 'atom_indices_to_group_indices', + 'rigid_group_default_frame', + # zy + 'num_sym', +) +TEMPLATE_FEATURES = ( + 'template_aatype', + 'template_all_atom_positions', + 'template_all_atom_mask', +) +CHAIN_FEATURES = ('num_alignments', 'seq_length') + + +def create_paired_features(chains: Iterable[NumpyDict], ) -> List[NumpyDict]: + """Returns the original chains with paired NUM_SEQ features. + + Args: + chains: A list of feature dictionaries for each chain. + + Returns: + A list of feature dictionaries with sequence features including only + rows to be paired. + """ + chains = list(chains) + chain_keys = chains[0].keys() + + if len(chains) < 2: + return chains + else: + updated_chains = [] + paired_chains_to_paired_row_indices = pair_sequences(chains) + paired_rows = reorder_paired_rows(paired_chains_to_paired_row_indices) + + for chain_num, chain in enumerate(chains): + new_chain = {k: v for k, v in chain.items() if '_all_seq' not in k} + for feature_name in chain_keys: + if feature_name.endswith('_all_seq'): + feats_padded = pad_features(chain[feature_name], + feature_name) + new_chain[feature_name] = feats_padded[ + paired_rows[:, chain_num]] + new_chain['num_alignments_all_seq'] = np.asarray( + len(paired_rows[:, chain_num])) + updated_chains.append(new_chain) + return updated_chains + + +def pad_features(feature: np.ndarray, feature_name: str) -> np.ndarray: + """Add a 'padding' row at the end of the features list. + + The padding row will be selected as a 'paired' row in the case of partial + alignment - for the chain that doesn't have paired alignment. + + Args: + feature: The feature to be padded. + feature_name: The name of the feature to be padded. + + Returns: + The feature with an additional padding row. 
+ """ + assert feature.dtype != np.dtype(np.string_) + if feature_name in ( + 'msa_all_seq', + 'msa_mask_all_seq', + 'deletion_matrix_all_seq', + 'deletion_matrix_int_all_seq', + ): + num_res = feature.shape[1] + padding = MSA_PAD_VALUES[feature_name] * np.ones([1, num_res], + feature.dtype) + elif feature_name == 'msa_species_identifiers_all_seq': + padding = [b''] + else: + return feature + feats_padded = np.concatenate([feature, padding], axis=0) + return feats_padded + + +def _make_msa_df(chain_features: NumpyDict) -> pd.DataFrame: + """Makes dataframe with msa features needed for msa pairing.""" + chain_msa = chain_features['msa_all_seq'] + query_seq = chain_msa[0] + per_seq_similarity = np.sum( + query_seq[None] == chain_msa, axis=-1) / float(len(query_seq)) + per_seq_gap = np.sum(chain_msa == 21, axis=-1) / float(len(query_seq)) + msa_df = pd.DataFrame({ + 'msa_species_identifiers': + chain_features['msa_species_identifiers_all_seq'], + 'msa_row': + np.arange(len(chain_features['msa_species_identifiers_all_seq'])), + 'msa_similarity': + per_seq_similarity, + 'gap': + per_seq_gap, + }) + return msa_df + + +def _create_species_dict(msa_df: pd.DataFrame) -> Dict[bytes, pd.DataFrame]: + """Creates mapping from species to msa dataframe of that species.""" + species_lookup = {} + for species, species_df in msa_df.groupby('msa_species_identifiers'): + species_lookup[species] = species_df + return species_lookup + + +def _match_rows_by_sequence_similarity( + this_species_msa_dfs: List[pd.DataFrame], ) -> List[List[int]]: # noqa + """Finds MSA sequence pairings across chains based on sequence similarity. + + Each chain's MSA sequences are first sorted by their sequence similarity to + their respective target sequence. The sequences are then paired, starting + from the sequences most similar to their target sequence. + + Args: + this_species_msa_dfs: a list of dataframes containing MSA features for + sequences for a specific species. + + Returns: + A list of lists, each containing M indices corresponding to paired MSA rows, + where M is the number of chains. + """ + all_paired_msa_rows = [] + + num_seqs = [ + len(species_df) for species_df in this_species_msa_dfs + if species_df is not None + ] + take_num_seqs = np.min(num_seqs) + + # sort_by_similarity = lambda x: x.sort_values( + # 'msa_similarity', axis=0, ascending=False) + + def sort_by_similarity(x): + return x.sort_values('msa_similarity', axis=0, ascending=False) + + for species_df in this_species_msa_dfs: + if species_df is not None: + species_df_sorted = sort_by_similarity(species_df) + msa_rows = species_df_sorted.msa_row.iloc[:take_num_seqs].values + else: + msa_rows = [-1] * take_num_seqs # take the last 'padding' row + all_paired_msa_rows.append(msa_rows) + all_paired_msa_rows = list(np.array(all_paired_msa_rows).transpose()) + return all_paired_msa_rows + + +def pair_sequences(examples: List[NumpyDict]) -> Dict[int, np.ndarray]: + """Returns indices for paired MSA sequences across chains.""" + + num_examples = len(examples) + + all_chain_species_dict = [] + common_species = set() + for chain_features in examples: + msa_df = _make_msa_df(chain_features) + species_dict = _create_species_dict(msa_df) + all_chain_species_dict.append(species_dict) + common_species.update(set(species_dict)) + + common_species = sorted(common_species) + common_species.remove(b'') # Remove target sequence species. 
+ + all_paired_msa_rows = [np.zeros(len(examples), int)] + all_paired_msa_rows_dict = {k: [] for k in range(num_examples)} + all_paired_msa_rows_dict[num_examples] = [np.zeros(len(examples), int)] + + for species in common_species: + if not species: + continue + this_species_msa_dfs = [] + species_dfs_present = 0 + for species_dict in all_chain_species_dict: + if species in species_dict: + this_species_msa_dfs.append(species_dict[species]) + species_dfs_present += 1 + else: + this_species_msa_dfs.append(None) + + # Skip species that are present in only one chain. + if species_dfs_present <= 1: + continue + + if np.any( + np.array([ + len(species_df) for species_df in this_species_msa_dfs + if isinstance(species_df, pd.DataFrame) + ]) > 600): + continue + + paired_msa_rows = _match_rows_by_sequence_similarity( + this_species_msa_dfs) + all_paired_msa_rows.extend(paired_msa_rows) + all_paired_msa_rows_dict[species_dfs_present].extend(paired_msa_rows) + all_paired_msa_rows_dict = { + num_examples: np.array(paired_msa_rows) + for num_examples, paired_msa_rows in all_paired_msa_rows_dict.items() + } + return all_paired_msa_rows_dict + + +def reorder_paired_rows( + all_paired_msa_rows_dict: Dict[int, np.ndarray]) -> np.ndarray: + """Creates a list of indices of paired MSA rows across chains. + + Args: + all_paired_msa_rows_dict: a mapping from the number of paired chains to the + paired indices. + + Returns: + a list of lists, each containing indices of paired MSA rows across chains. + The paired-index lists are ordered by: + 1) the number of chains in the paired alignment, i.e, all-chain pairings + will come first. + 2) e-values + """ + all_paired_msa_rows = [] + + for num_pairings in sorted(all_paired_msa_rows_dict, reverse=True): + paired_rows = all_paired_msa_rows_dict[num_pairings] + paired_rows_product = np.abs( + np.array( + [np.prod(rows.astype(np.float64)) for rows in paired_rows])) + paired_rows_sort_index = np.argsort(paired_rows_product) + all_paired_msa_rows.extend(paired_rows[paired_rows_sort_index]) + + return np.array(all_paired_msa_rows) + + +def block_diag(*arrs: np.ndarray, pad_value: float = 0.0) -> np.ndarray: + """Like scipy.linalg.block_diag but with an optional padding value.""" + ones_arrs = [np.ones_like(x) for x in arrs] + off_diag_mask = 1 - scipy.linalg.block_diag(*ones_arrs) + diag = scipy.linalg.block_diag(*arrs) + diag += (off_diag_mask * pad_value).astype(diag.dtype) + return diag + + +def _correct_post_merged_feats(np_example: NumpyDict, + np_chains_list: Sequence[NumpyDict], + pair_msa_sequences: bool) -> NumpyDict: + """Adds features that need to be computed/recomputed post merging.""" + + np_example['seq_length'] = np.asarray( + np_example['aatype'].shape[0], dtype=np.int32) + np_example['num_alignments'] = np.asarray( + np_example['msa'].shape[0], dtype=np.int32) + + if not pair_msa_sequences: + # Generate a bias that is 1 for the first row of every block in the + # block diagonal MSA - i.e. make sure the cluster stack always includes + # the query sequences for each chain (since the first row is the query + # sequence). + cluster_bias_masks = [] + for chain in np_chains_list: + mask = np.zeros(chain['msa'].shape[0]) + mask[0] = 1 + cluster_bias_masks.append(mask) + np_example['cluster_bias_mask'] = np.concatenate(cluster_bias_masks) + + # Initialize Bert mask with masked out off diagonals. 
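+ # block_diag keeps the BERT-mask ones inside each chain's own MSA block, so
+ # masking is only ever applied to a chain's real (non-padded) positions.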
+ msa_masks = [ + np.ones(x['msa'].shape, dtype=np.int8) for x in np_chains_list + ] + + np_example['bert_mask'] = block_diag(*msa_masks, pad_value=0) + else: + np_example['cluster_bias_mask'] = np.zeros(np_example['msa'].shape[0]) + np_example['cluster_bias_mask'][0] = 1 + + # Initialize Bert mask with masked out off diagonals. + msa_masks = [ + np.ones(x['msa'].shape, dtype=np.int8) for x in np_chains_list + ] + msa_masks_all_seq = [ + np.ones(x['msa_all_seq'].shape, dtype=np.int8) + for x in np_chains_list + ] + + msa_mask_block_diag = block_diag(*msa_masks, pad_value=0) + msa_mask_all_seq = np.concatenate(msa_masks_all_seq, axis=1) + np_example['bert_mask'] = np.concatenate( + [msa_mask_all_seq, msa_mask_block_diag], axis=0) + return np_example + + +def _pad_templates(chains: Sequence[NumpyDict], + max_templates: int) -> Sequence[NumpyDict]: + """For each chain pad the number of templates to a fixed size. + + Args: + chains: A list of protein chains. + max_templates: Each chain will be padded to have this many templates. + + Returns: + The list of chains, updated to have template features padded to + max_templates. + """ + for chain in chains: + for k, v in chain.items(): + if k in TEMPLATE_FEATURES: + padding = np.zeros_like(v.shape) + padding[0] = max_templates - v.shape[0] + padding = [(0, p) for p in padding] + chain[k] = np.pad(v, padding, mode='constant') + return chains + + +def _merge_features_from_multiple_chains( + chains: Sequence[NumpyDict], pair_msa_sequences: bool) -> NumpyDict: + """Merge features from multiple chains. + + Args: + chains: A list of feature dictionaries that we want to merge. + pair_msa_sequences: Whether to concatenate MSA features along the + num_res dimension (if True), or to block diagonalize them (if False). + + Returns: + A feature dictionary for the merged example. + """ + merged_example = {} + for feature_name in chains[0]: + feats = [x[feature_name] for x in chains] + feature_name_split = feature_name.split('_all_seq')[0] + if feature_name_split in MSA_FEATURES: + if pair_msa_sequences or '_all_seq' in feature_name: + merged_example[feature_name] = np.concatenate(feats, axis=1) + if feature_name_split == 'msa': + merged_example['msa_chains_all_seq'] = np.ones( + merged_example[feature_name].shape[0]).reshape(-1, 1) + else: + merged_example[feature_name] = block_diag( + *feats, pad_value=MSA_PAD_VALUES[feature_name]) + if feature_name_split == 'msa': + msa_chains = [] + for i, feat in enumerate(feats): + cur_shape = feat.shape[0] + vals = np.ones(cur_shape) * (i + 2) + msa_chains.append(vals) + merged_example['msa_chains'] = np.concatenate( + msa_chains).reshape(-1, 1) + elif feature_name_split in SEQ_FEATURES: + merged_example[feature_name] = np.concatenate(feats, axis=0) + elif feature_name_split in TEMPLATE_FEATURES: + merged_example[feature_name] = np.concatenate(feats, axis=1) + elif feature_name_split in CHAIN_FEATURES: + merged_example[feature_name] = np.sum(feats).astype(np.int32) + else: + merged_example[feature_name] = feats[0] + return merged_example + + +def _merge_homomers_dense_msa( + chains: Iterable[NumpyDict]) -> Sequence[NumpyDict]: + """Merge all identical chains, making the resulting MSA dense. + + Args: + chains: An iterable of features for each chain. + + Returns: + A list of feature dictionaries. All features with the same entity_id + will be merged - MSA features will be concatenated along the num_res + dimension - making them dense. 
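+      For a homodimer, for instance, the two identical chains collapse into a
+      single entry whose MSA spans both copies of the sequence.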
+ """ + entity_chains = collections.defaultdict(list) + for chain in chains: + entity_id = chain['entity_id'][0] + entity_chains[entity_id].append(chain) + + grouped_chains = [] + for entity_id in sorted(entity_chains): + chains = entity_chains[entity_id] + grouped_chains.append(chains) + chains = [ + _merge_features_from_multiple_chains(chains, pair_msa_sequences=True) + for chains in grouped_chains + ] + return chains + + +def _concatenate_paired_and_unpaired_features(example: NumpyDict) -> NumpyDict: + """Merges paired and block-diagonalised features.""" + features = MSA_FEATURES + ('msa_chains', ) + for feature_name in features: + if feature_name in example: + feat = example[feature_name] + feat_all_seq = example[feature_name + '_all_seq'] + try: + merged_feat = np.concatenate([feat_all_seq, feat], axis=0) + except Exception as ex: + raise Exception( + 'concat failed.', + feature_name, + feat_all_seq.shape, + feat.shape, + ex.__class__, + ex, + ) + example[feature_name] = merged_feat + example['num_alignments'] = np.array( + example['msa'].shape[0], dtype=np.int32) + return example + + +def merge_chain_features(np_chains_list: List[NumpyDict], + pair_msa_sequences: bool, + max_templates: int) -> NumpyDict: + """Merges features for multiple chains to single FeatureDict. + + Args: + np_chains_list: List of FeatureDicts for each chain. + pair_msa_sequences: Whether to merge paired MSAs. + max_templates: The maximum number of templates to include. + + Returns: + Single FeatureDict for entire complex. + """ + np_chains_list = _pad_templates( + np_chains_list, max_templates=max_templates) + np_chains_list = _merge_homomers_dense_msa(np_chains_list) + # Unpaired MSA features will be always block-diagonalised; paired MSA + # features will be concatenated. + np_example = _merge_features_from_multiple_chains( + np_chains_list, pair_msa_sequences=False) + if pair_msa_sequences: + np_example = _concatenate_paired_and_unpaired_features(np_example) + np_example = _correct_post_merged_feats( + np_example=np_example, + np_chains_list=np_chains_list, + pair_msa_sequences=pair_msa_sequences, + ) + + return np_example + + +def deduplicate_unpaired_sequences( + np_chains: List[NumpyDict]) -> List[NumpyDict]: + """Removes unpaired sequences which duplicate a paired sequence.""" + + feature_names = np_chains[0].keys() + msa_features = MSA_FEATURES + cache_msa_features = {} + for chain in np_chains: + entity_id = int(chain['entity_id'][0]) + if entity_id not in cache_msa_features: + sequence_set = set(s.tobytes() for s in chain['msa_all_seq']) + keep_rows = [] + # Go through unpaired MSA seqs and remove any rows that correspond to the + # sequences that are already present in the paired MSA. 
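+            # Rows are compared as the raw bytes of their encoded sequences,
+            # and the surviving rows keep their original order.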
+ for row_num, seq in enumerate(chain['msa']): + if seq.tobytes() not in sequence_set: + keep_rows.append(row_num) + new_msa_features = {} + for feature_name in feature_names: + if feature_name in msa_features: + if keep_rows: + new_msa_features[feature_name] = chain[feature_name][ + keep_rows] + else: + new_shape = list(chain[feature_name].shape) + new_shape[0] = 0 + new_msa_features[feature_name] = np.zeros( + new_shape, dtype=chain[feature_name].dtype) + cache_msa_features[entity_id] = new_msa_features + for feature_name in cache_msa_features[entity_id]: + chain[feature_name] = cache_msa_features[entity_id][feature_name] + chain['num_alignments'] = np.array( + chain['msa'].shape[0], dtype=np.int32) + return np_chains diff --git a/modelscope/models/science/unifold/data/process.py b/modelscope/models/science/unifold/data/process.py new file mode 100644 index 00000000..3987cb1c --- /dev/null +++ b/modelscope/models/science/unifold/data/process.py @@ -0,0 +1,264 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +from typing import Optional + +import numpy as np +import torch + +from modelscope.models.science.unifold.data import data_ops + + +def nonensembled_fns(common_cfg, mode_cfg): + """Input pipeline data transformers that are not ensembled.""" + v2_feature = common_cfg.v2_feature + operators = [] + if mode_cfg.random_delete_msa: + operators.append( + data_ops.random_delete_msa(common_cfg.random_delete_msa)) + operators.extend([ + data_ops.cast_to_64bit_ints, + data_ops.correct_msa_restypes, + data_ops.squeeze_features, + data_ops.randomly_replace_msa_with_unknown(0.0), + data_ops.make_seq_mask, + data_ops.make_msa_mask, + ]) + operators.append(data_ops.make_hhblits_profile_v2 + if v2_feature else data_ops.make_hhblits_profile) + if common_cfg.use_templates: + operators.extend([ + data_ops.make_template_mask, + data_ops.make_pseudo_beta('template_'), + ]) + operators.append( + data_ops.crop_templates( + max_templates=mode_cfg.max_templates, + subsample_templates=mode_cfg.subsample_templates, + )) + + if common_cfg.use_template_torsion_angles: + operators.extend([ + data_ops.atom37_to_torsion_angles('template_'), + ]) + + operators.append(data_ops.make_atom14_masks) + operators.append(data_ops.make_target_feat) + + return operators + + +def crop_and_fix_size_fns(common_cfg, mode_cfg, crop_and_fix_size_seed): + operators = [] + if common_cfg.reduce_msa_clusters_by_max_templates: + pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates + else: + pad_msa_clusters = mode_cfg.max_msa_clusters + crop_feats = dict(common_cfg.features) + if mode_cfg.fixed_size: + if mode_cfg.crop: + if common_cfg.is_multimer: + crop_fn = data_ops.crop_to_size_multimer( + crop_size=mode_cfg.crop_size, + shape_schema=crop_feats, + seed=crop_and_fix_size_seed, + spatial_crop_prob=mode_cfg.spatial_crop_prob, + ca_ca_threshold=mode_cfg.ca_ca_threshold, + ) + else: + crop_fn = data_ops.crop_to_size_single( + crop_size=mode_cfg.crop_size, + shape_schema=crop_feats, + seed=crop_and_fix_size_seed, + ) + operators.append(crop_fn) + + operators.append(data_ops.select_feat(crop_feats)) + + operators.append( + data_ops.make_fixed_size( + crop_feats, + pad_msa_clusters, + common_cfg.max_extra_msa, + mode_cfg.crop_size, + mode_cfg.max_templates, + )) + return operators + + +def ensembled_fns(common_cfg, mode_cfg): + """Input pipeline data transformers that can be ensembled and averaged.""" + 
operators = [] + multimer_mode = common_cfg.is_multimer + v2_feature = common_cfg.v2_feature + # multimer don't use block delete msa + if mode_cfg.block_delete_msa and not multimer_mode: + operators.append( + data_ops.block_delete_msa(common_cfg.block_delete_msa)) + if 'max_distillation_msa_clusters' in mode_cfg: + operators.append( + data_ops.sample_msa_distillation( + mode_cfg.max_distillation_msa_clusters)) + + if common_cfg.reduce_msa_clusters_by_max_templates: + pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates + else: + pad_msa_clusters = mode_cfg.max_msa_clusters + + max_msa_clusters = pad_msa_clusters + max_extra_msa = common_cfg.max_extra_msa + + assert common_cfg.resample_msa_in_recycling + gumbel_sample = common_cfg.gumbel_sample + operators.append( + data_ops.sample_msa( + max_msa_clusters, + keep_extra=True, + gumbel_sample=gumbel_sample, + biased_msa_by_chain=mode_cfg.biased_msa_by_chain, + )) + + if 'masked_msa' in common_cfg: + # Masked MSA should come *before* MSA clustering so that + # the clustering and full MSA profile do not leak information about + # the masked locations and secret corrupted locations. + operators.append( + data_ops.make_masked_msa( + common_cfg.masked_msa, + mode_cfg.masked_msa_replace_fraction, + gumbel_sample=gumbel_sample, + share_mask=mode_cfg.share_mask, + )) + + if common_cfg.msa_cluster_features: + if v2_feature: + operators.append(data_ops.nearest_neighbor_clusters_v2()) + else: + operators.append(data_ops.nearest_neighbor_clusters()) + operators.append(data_ops.summarize_clusters) + + if v2_feature: + operators.append(data_ops.make_msa_feat_v2) + else: + operators.append(data_ops.make_msa_feat) + # Crop after creating the cluster profiles. + if max_extra_msa: + if v2_feature: + operators.append(data_ops.make_extra_msa_feat(max_extra_msa)) + else: + operators.append(data_ops.crop_extra_msa(max_extra_msa)) + else: + operators.append(data_ops.delete_extra_msa) + # operators.append(data_operators.select_feat(common_cfg.recycling_features)) + return operators + + +def process_features(tensors, common_cfg, mode_cfg): + """Based on the config, apply filters and transformations to the data.""" + is_distillation = bool(tensors.get('is_distillation', 0)) + multimer_mode = common_cfg.is_multimer + crop_and_fix_size_seed = int(tensors['crop_and_fix_size_seed']) + crop_fn = crop_and_fix_size_fns( + common_cfg, + mode_cfg, + crop_and_fix_size_seed, + ) + + def wrap_ensemble_fn(data, i): + """Function to be mapped over the ensemble dimension.""" + d = data.copy() + fns = ensembled_fns( + common_cfg, + mode_cfg, + ) + new_d = compose(fns)(d) + if not multimer_mode or is_distillation: + new_d = data_ops.select_feat(common_cfg.recycling_features)(new_d) + return compose(crop_fn)(new_d) + else: # select after crop for spatial cropping + d = compose(crop_fn)(d) + d = data_ops.select_feat(common_cfg.recycling_features)(d) + return d + + nonensembled = nonensembled_fns(common_cfg, mode_cfg) + + if mode_cfg.supervised and (not multimer_mode or is_distillation): + nonensembled.extend(label_transform_fn()) + + tensors = compose(nonensembled)(tensors) + + num_recycling = int(tensors['num_recycling_iters']) + 1 + num_ensembles = mode_cfg.num_ensembles + + ensemble_tensors = map_fn( + lambda x: wrap_ensemble_fn(tensors, x), + torch.arange(num_recycling * num_ensembles), + ) + tensors = compose(crop_fn)(tensors) + # add a dummy dim to align with recycling features + tensors = {k: torch.stack([tensors[k]], dim=0) for k in tensors} + 
tensors.update(ensemble_tensors) + return tensors + + +@data_ops.curry1 +def compose(x, fs): + for f in fs: + x = f(x) + return x + + +def pad_then_stack(values, ): + if len(values[0].shape) >= 1: + size = max(v.shape[0] for v in values) + new_values = [] + for v in values: + if v.shape[0] < size: + res = values[0].new_zeros(size, *v.shape[1:]) + res[:v.shape[0], ...] = v + else: + res = v + new_values.append(res) + else: + new_values = values + return torch.stack(new_values, dim=0) + + +def map_fn(fun, x): + ensembles = [fun(elem) for elem in x] + features = ensembles[0].keys() + ensembled_dict = {} + for feat in features: + ensembled_dict[feat] = pad_then_stack( + [dict_i[feat] for dict_i in ensembles]) + return ensembled_dict + + +def process_single_label(label: dict, + num_ensemble: Optional[int] = None) -> dict: + assert 'aatype' in label + assert 'all_atom_positions' in label + assert 'all_atom_mask' in label + label = compose(label_transform_fn())(label) + if num_ensemble is not None: + label = { + k: torch.stack([v for _ in range(num_ensemble)]) + for k, v in label.items() + } + return label + + +def process_labels(labels_list, num_ensemble: Optional[int] = None): + return [process_single_label(ll, num_ensemble) for ll in labels_list] + + +def label_transform_fn(): + return [ + data_ops.make_atom14_masks, + data_ops.make_atom14_positions, + data_ops.atom37_to_frames, + data_ops.atom37_to_torsion_angles(''), + data_ops.make_pseudo_beta(''), + data_ops.get_backbone_frames, + data_ops.get_chi_angles, + ] diff --git a/modelscope/models/science/unifold/data/process_multimer.py b/modelscope/models/science/unifold/data/process_multimer.py new file mode 100644 index 00000000..04572d2d --- /dev/null +++ b/modelscope/models/science/unifold/data/process_multimer.py @@ -0,0 +1,417 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Feature processing logic for multimer data """ + +import collections +from typing import Iterable, List, MutableMapping + +import numpy as np + +from modelscope.models.science.unifold.data import (msa_pairing, + residue_constants) +from .utils import correct_template_restypes + +FeatureDict = MutableMapping[str, np.ndarray] + +REQUIRED_FEATURES = frozenset({ + 'aatype', + 'all_atom_mask', + 'all_atom_positions', + 'all_chains_entity_ids', + 'all_crops_all_chains_mask', + 'all_crops_all_chains_positions', + 'all_crops_all_chains_residue_ids', + 'assembly_num_chains', + 'asym_id', + 'bert_mask', + 'cluster_bias_mask', + 'deletion_matrix', + 'deletion_mean', + 'entity_id', + 'entity_mask', + 'mem_peak', + 'msa', + 'msa_mask', + 'num_alignments', + 'num_templates', + 'queue_size', + 'residue_index', + 'resolution', + 'seq_length', + 'seq_mask', + 'sym_id', + 'template_aatype', + 'template_all_atom_mask', + 'template_all_atom_positions', + # zy added: + 'asym_len', + 'template_sum_probs', + 'num_sym', + 'msa_chains', +}) + +MAX_TEMPLATES = 4 +MSA_CROP_SIZE = 2048 + + +def _is_homomer_or_monomer(chains: Iterable[FeatureDict]) -> bool: + """Checks if a list of chains represents a homomer/monomer example.""" + # Note that an entity_id of 0 indicates padding. + num_unique_chains = len( + np.unique( + np.concatenate([ + np.unique(chain['entity_id'][chain['entity_id'] > 0]) + for chain in chains + ]))) + return num_unique_chains == 1 + + +def pair_and_merge( + all_chain_features: MutableMapping[str, FeatureDict]) -> FeatureDict: + """Runs processing on features to augment, pair and merge. + + Args: + all_chain_features: A MutableMap of dictionaries of features for each chain. + + Returns: + A dictionary of features. + """ + + process_unmerged_features(all_chain_features) + + np_chains_list = all_chain_features + + pair_msa_sequences = not _is_homomer_or_monomer(np_chains_list) + + if pair_msa_sequences: + np_chains_list = msa_pairing.create_paired_features( + chains=np_chains_list) + np_chains_list = msa_pairing.deduplicate_unpaired_sequences( + np_chains_list) + np_chains_list = crop_chains( + np_chains_list, + msa_crop_size=MSA_CROP_SIZE, + pair_msa_sequences=pair_msa_sequences, + max_templates=MAX_TEMPLATES, + ) + np_example = msa_pairing.merge_chain_features( + np_chains_list=np_chains_list, + pair_msa_sequences=pair_msa_sequences, + max_templates=MAX_TEMPLATES, + ) + np_example = process_final(np_example) + return np_example + + +def crop_chains( + chains_list: List[FeatureDict], + msa_crop_size: int, + pair_msa_sequences: bool, + max_templates: int, +) -> List[FeatureDict]: + """Crops the MSAs for a set of chains. + + Args: + chains_list: A list of chains to be cropped. + msa_crop_size: The total number of sequences to crop from the MSA. + pair_msa_sequences: Whether we are operating in sequence-pairing mode. + max_templates: The maximum templates to use per chain. + + Returns: + The chains cropped. + """ + + # Apply the cropping. 
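+    # Each chain is cropped independently. When pairing is enabled,
+    # _crop_single_chain caps paired rows at msa_crop_size // 2 and shrinks
+    # the unpaired budget by the number of non-gapped paired rows, keeping the
+    # per-chain total roughly at msa_crop_size.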
+ cropped_chains = [] + for chain in chains_list: + cropped_chain = _crop_single_chain( + chain, + msa_crop_size=msa_crop_size, + pair_msa_sequences=pair_msa_sequences, + max_templates=max_templates, + ) + cropped_chains.append(cropped_chain) + + return cropped_chains + + +def _crop_single_chain(chain: FeatureDict, msa_crop_size: int, + pair_msa_sequences: bool, + max_templates: int) -> FeatureDict: + """Crops msa sequences to `msa_crop_size`.""" + msa_size = chain['num_alignments'] + + if pair_msa_sequences: + msa_size_all_seq = chain['num_alignments_all_seq'] + msa_crop_size_all_seq = np.minimum(msa_size_all_seq, + msa_crop_size // 2) + + # We reduce the number of un-paired sequences, by the number of times a + # sequence from this chain's MSA is included in the paired MSA. This keeps + # the MSA size for each chain roughly constant. + msa_all_seq = chain['msa_all_seq'][:msa_crop_size_all_seq, :] + num_non_gapped_pairs = np.sum( + np.any(msa_all_seq != msa_pairing.MSA_GAP_IDX, axis=1)) + num_non_gapped_pairs = np.minimum(num_non_gapped_pairs, + msa_crop_size_all_seq) + + # Restrict the unpaired crop size so that paired+unpaired sequences do not + # exceed msa_seqs_per_chain for each chain. + max_msa_crop_size = np.maximum(msa_crop_size - num_non_gapped_pairs, 0) + msa_crop_size = np.minimum(msa_size, max_msa_crop_size) + else: + msa_crop_size = np.minimum(msa_size, msa_crop_size) + + include_templates = 'template_aatype' in chain and max_templates + if include_templates: + num_templates = chain['template_aatype'].shape[0] + templates_crop_size = np.minimum(num_templates, max_templates) + + for k in chain: + k_split = k.split('_all_seq')[0] + if k_split in msa_pairing.TEMPLATE_FEATURES: + chain[k] = chain[k][:templates_crop_size, :] + elif k_split in msa_pairing.MSA_FEATURES: + if '_all_seq' in k and pair_msa_sequences: + chain[k] = chain[k][:msa_crop_size_all_seq, :] + else: + chain[k] = chain[k][:msa_crop_size, :] + + chain['num_alignments'] = np.asarray(msa_crop_size, dtype=np.int32) + if include_templates: + chain['num_templates'] = np.asarray( + templates_crop_size, dtype=np.int32) + if pair_msa_sequences: + chain['num_alignments_all_seq'] = np.asarray( + msa_crop_size_all_seq, dtype=np.int32) + return chain + + +def process_final(np_example: FeatureDict) -> FeatureDict: + """Final processing steps in data pipeline, after merging and pairing.""" + np_example = _make_seq_mask(np_example) + np_example = _make_msa_mask(np_example) + np_example = _filter_features(np_example) + return np_example + + +def _make_seq_mask(np_example): + np_example['seq_mask'] = (np_example['entity_id'] > 0).astype(np.float32) + return np_example + + +def _make_msa_mask(np_example): + """Mask features are all ones, but will later be zero-padded.""" + + np_example['msa_mask'] = np.ones_like(np_example['msa'], dtype=np.int8) + + seq_mask = (np_example['entity_id'] > 0).astype(np.int8) + np_example['msa_mask'] *= seq_mask[None] + + return np_example + + +def _filter_features(np_example: FeatureDict) -> FeatureDict: + """Filters features of example to only those requested.""" + return {k: v for (k, v) in np_example.items() if k in REQUIRED_FEATURES} + + +def process_unmerged_features(all_chain_features: MutableMapping[str, + FeatureDict]): + """Postprocessing stage for per-chain features before merging.""" + num_chains = len(all_chain_features) + for chain_features in all_chain_features: + # Convert deletion matrices to float. 
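+        # Both the unpaired and the paired ('_all_seq') integer matrices are
+        # converted; deletion_mean below is an average over MSA rows of the
+        # unpaired one.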
+ if 'deletion_matrix_int' in chain_features: + chain_features['deletion_matrix'] = np.asarray( + chain_features.pop('deletion_matrix_int'), dtype=np.float32) + if 'deletion_matrix_int_all_seq' in chain_features: + chain_features['deletion_matrix_all_seq'] = np.asarray( + chain_features.pop('deletion_matrix_int_all_seq'), + dtype=np.float32) + + chain_features['deletion_mean'] = np.mean( + chain_features['deletion_matrix'], axis=0) + + if 'all_atom_positions' not in chain_features: + # Add all_atom_mask and dummy all_atom_positions based on aatype. + all_atom_mask = residue_constants.STANDARD_ATOM_MASK[ + chain_features['aatype']] + chain_features['all_atom_mask'] = all_atom_mask + chain_features['all_atom_positions'] = np.zeros( + list(all_atom_mask.shape) + [3]) + + # Add assembly_num_chains. + chain_features['assembly_num_chains'] = np.asarray(num_chains) + + # Add entity_mask. + for chain_features in all_chain_features: + chain_features['entity_mask'] = ( + chain_features['entity_id'] != # noqa W504 + 0).astype(np.int32) + + +def empty_template_feats(n_res): + return { + 'template_aatype': + np.zeros((0, n_res)).astype(np.int64), + 'template_all_atom_positions': + np.zeros((0, n_res, 37, 3)).astype(np.float32), + 'template_sum_probs': + np.zeros((0, 1)).astype(np.float32), + 'template_all_atom_mask': + np.zeros((0, n_res, 37)).astype(np.float32), + } + + +def convert_monomer_features(monomer_features: FeatureDict) -> FeatureDict: + """Reshapes and modifies monomer features for multimer models.""" + if monomer_features['template_aatype'].shape[0] == 0: + monomer_features.update( + empty_template_feats(monomer_features['aatype'].shape[0])) + converted = {} + unnecessary_leading_dim_feats = { + 'sequence', + 'domain_name', + 'num_alignments', + 'seq_length', + } + for feature_name, feature in monomer_features.items(): + if feature_name in unnecessary_leading_dim_feats: + # asarray ensures it's a np.ndarray. + feature = np.asarray(feature[0], dtype=feature.dtype) + elif feature_name == 'aatype': + # The multimer model performs the one-hot operation itself. + feature = np.argmax(feature, axis=-1).astype(np.int32) + elif feature_name == 'template_aatype': + if feature.shape[0] > 0: + feature = correct_template_restypes(feature) + elif feature_name == 'template_all_atom_masks': + feature_name = 'template_all_atom_mask' + elif feature_name == 'msa': + feature = feature.astype(np.uint8) + + if feature_name.endswith('_mask'): + feature = feature.astype(np.float32) + + converted[feature_name] = feature + + if 'deletion_matrix_int' in monomer_features: + monomer_features['deletion_matrix'] = monomer_features.pop( + 'deletion_matrix_int').astype(np.float32) + + converted.pop( + 'template_sum_probs' + ) # zy: this input is checked to be dirty in shape. TODO: figure out why and make it right. + return converted + + +def int_id_to_str_id(num: int) -> str: + """Encodes a number as a string, using reverse spreadsheet style naming. + + Args: + num: A positive integer. + + Returns: + A string that encodes the positive integer using reverse spreadsheet style, + naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the + usual way to encode chain IDs in mmCIF files. + """ + if num <= 0: + raise ValueError(f'Only positive integers allowed, got {num}.') + + num = num - 1 # 1-based indexing. 
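+    # Letters are emitted least-significant first and deliberately not
+    # reversed, which is what makes this the 'reverse spreadsheet' scheme
+    # (e.g. 28 -> 'BA' rather than 'AB').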
+ output = [] + while num >= 0: + output.append(chr(num % 26 + ord('A'))) + num = num // 26 - 1 + return ''.join(output) + + +def add_assembly_features(all_chain_features, ): + """Add features to distinguish between chains. + + Args: + all_chain_features: A dictionary which maps chain_id to a dictionary of + features for each chain. + + Returns: + all_chain_features: A dictionary which maps strings of the form + `_` to the corresponding chain features. E.g. two + chains from a homodimer would have keys A_1 and A_2. Two chains from a + heterodimer would have keys A_1 and B_1. + """ + # Group the chains by sequence + seq_to_entity_id = {} + grouped_chains = collections.defaultdict(list) + for chain_features in all_chain_features: + assert 'sequence' in chain_features + seq = str(chain_features['sequence']) + if seq not in seq_to_entity_id: + seq_to_entity_id[seq] = len(seq_to_entity_id) + 1 + grouped_chains[seq_to_entity_id[seq]].append(chain_features) + + new_all_chain_features = [] + chain_id = 1 + for entity_id, group_chain_features in grouped_chains.items(): + num_sym = len(group_chain_features) # zy + for sym_id, chain_features in enumerate(group_chain_features, start=1): + seq_length = chain_features['seq_length'] + chain_features['asym_id'] = chain_id * np.ones(seq_length) + chain_features['sym_id'] = sym_id * np.ones(seq_length) + chain_features['entity_id'] = entity_id * np.ones(seq_length) + chain_features['num_sym'] = num_sym * np.ones(seq_length) + chain_id += 1 + new_all_chain_features.append(chain_features) + + return new_all_chain_features + + +def pad_msa(np_example, min_num_seq): + np_example = dict(np_example) + num_seq = np_example['msa'].shape[0] + if num_seq < min_num_seq: + for feat in ('msa', 'deletion_matrix', 'bert_mask', 'msa_mask', + 'msa_chains'): + np_example[feat] = np.pad(np_example[feat], + ((0, min_num_seq - num_seq), (0, 0))) + np_example['cluster_bias_mask'] = np.pad( + np_example['cluster_bias_mask'], ((0, min_num_seq - num_seq), )) + return np_example + + +def post_process(np_example): + np_example = pad_msa(np_example, 512) + no_dim_keys = [ + 'num_alignments', + 'assembly_num_chains', + 'num_templates', + 'seq_length', + 'resolution', + ] + for k in no_dim_keys: + if k in np_example: + np_example[k] = np_example[k].reshape(-1) + return np_example + + +def merge_msas(msa, del_mat, new_msa, new_del_mat): + cur_msa_set = set([tuple(m) for m in msa]) + new_rows = [] + for i, s in enumerate(new_msa): + if tuple(s) not in cur_msa_set: + new_rows.append(i) + ret_msa = np.concatenate([msa, new_msa[new_rows]], axis=0) + ret_del_mat = np.concatenate([del_mat, new_del_mat[new_rows]], axis=0) + return ret_msa, ret_del_mat diff --git a/modelscope/models/science/unifold/data/protein.py b/modelscope/models/science/unifold/data/protein.py new file mode 100644 index 00000000..42308d04 --- /dev/null +++ b/modelscope/models/science/unifold/data/protein.py @@ -0,0 +1,322 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Protein data type.""" +import dataclasses +import io +from typing import Any, Mapping, Optional + +import numpy as np +from Bio.PDB import PDBParser + +from modelscope.models.science.unifold.data import residue_constants + +FeatureDict = Mapping[str, np.ndarray] +ModelOutput = Mapping[str, Any] # Is a nested dict. + +# Complete sequence of chain IDs supported by the PDB format. +PDB_CHAIN_IDS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789' +PDB_MAX_CHAINS = len(PDB_CHAIN_IDS) # := 62. + + +@dataclasses.dataclass(frozen=True) +class Protein: + """Protein structure representation.""" + + # Cartesian coordinates of atoms in angstroms. The atom types correspond to + # residue_constants.atom_types, i.e. the first three are N, CA, CB. + atom_positions: np.ndarray # [num_res, num_atom_type, 3] + + # Amino-acid type for each residue represented as an integer between 0 and + # 20, where 20 is 'X'. + aatype: np.ndarray # [num_res] + + # Binary float mask to indicate presence of a particular atom. 1.0 if an atom + # is present and 0.0 if not. This should be used for loss masking. + atom_mask: np.ndarray # [num_res, num_atom_type] + + # Residue index as used in PDB. It is not necessarily continuous or 0-indexed. + residue_index: np.ndarray # [num_res] + + # 0-indexed number corresponding to the chain in the protein that this residue + # belongs to. + chain_index: np.ndarray # [num_res] + + # B-factors, or temperature factors, of each residue (in sq. angstroms units), + # representing the displacement of the residue from its ground truth mean + # value. + b_factors: np.ndarray # [num_res, num_atom_type] + + def __post_init__(self): + if len(np.unique(self.chain_index)) > PDB_MAX_CHAINS: + raise ValueError( + f'Cannot build an instance with more than {PDB_MAX_CHAINS} chains ' + 'because these cannot be written to PDB format.') + + +def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein: + """Takes a PDB string and constructs a Protein object. + + WARNING: All non-standard residue types will be converted into UNK. All + non-standard atoms will be ignored. + + Args: + pdb_str: The contents of the pdb file + chain_id: If chain_id is specified (e.g. A), then only that chain + is parsed. Otherwise all chains are parsed. + + Returns: + A new `Protein` parsed from the pdb contents. + """ + pdb_fh = io.StringIO(pdb_str) + parser = PDBParser(QUIET=True) + structure = parser.get_structure('none', pdb_fh) + models = list(structure.get_models()) + if len(models) != 1: + raise ValueError( + f'Only single model PDBs are supported. Found {len(models)} models.' + ) + model = models[0] + + atom_positions = [] + aatype = [] + atom_mask = [] + residue_index = [] + chain_ids = [] + b_factors = [] + + for chain in model: + if chain_id is not None and chain.id != chain_id: + continue + for res in chain: + if res.id[2] != ' ': + raise ValueError( + f'PDB contains an insertion code at chain {chain.id} and residue ' + f'index {res.id[1]}. 
These are not supported.') + res_shortname = residue_constants.restype_3to1.get( + res.resname, 'X') + restype_idx = residue_constants.restype_order.get( + res_shortname, residue_constants.restype_num) + pos = np.zeros((residue_constants.atom_type_num, 3)) + mask = np.zeros((residue_constants.atom_type_num, )) + res_b_factors = np.zeros((residue_constants.atom_type_num, )) + for atom in res: + if atom.name not in residue_constants.atom_types: + continue + pos[residue_constants.atom_order[atom.name]] = atom.coord + mask[residue_constants.atom_order[atom.name]] = 1.0 + res_b_factors[residue_constants.atom_order[ + atom.name]] = atom.bfactor + if np.sum(mask) < 0.5: + # If no known atom positions are reported for the residue then skip it. + continue + aatype.append(restype_idx) + atom_positions.append(pos) + atom_mask.append(mask) + residue_index.append(res.id[1]) + chain_ids.append(chain.id) + b_factors.append(res_b_factors) + + # Chain IDs are usually characters so map these to ints. + unique_chain_ids = np.unique(chain_ids) + chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)} + chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids]) + + return Protein( + atom_positions=np.array(atom_positions), + atom_mask=np.array(atom_mask), + aatype=np.array(aatype), + residue_index=np.array(residue_index), + chain_index=chain_index, + b_factors=np.array(b_factors), + ) + + +def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str: + chain_end = 'TER' + return (f'{chain_end:<6}{atom_index:>5} {end_resname:>3} ' + f'{chain_name:>1}{residue_index:>4}') + + +def to_pdb(prot: Protein) -> str: + """Converts a `Protein` instance to a PDB string. + + Args: + prot: The protein to convert to PDB. + + Returns: + PDB string. + """ + restypes = residue_constants.restypes + ['X'] + + # res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK') + def res_1to3(r): + return residue_constants.restype_1to3.get(restypes[r], 'UNK') + + atom_types = residue_constants.atom_types + + pdb_lines = [] + + atom_mask = prot.atom_mask + aatype = prot.aatype + atom_positions = prot.atom_positions + residue_index = prot.residue_index.astype(np.int32) + chain_index = prot.chain_index.astype(np.int32) + b_factors = prot.b_factors + + if np.any(aatype > residue_constants.restype_num): + raise ValueError('Invalid aatypes.') + + # Construct a mapping from chain integer indices to chain ID strings. + chain_ids = {} + for i in np.unique(chain_index): # np.unique gives sorted output. + if i >= PDB_MAX_CHAINS: + raise ValueError( + f'The PDB format supports at most {PDB_MAX_CHAINS} chains.') + chain_ids[i] = PDB_CHAIN_IDS[i] + + pdb_lines.append('MODEL 1') + atom_index = 1 + last_chain_index = chain_index[0] + # Add all atom sites. + for i in range(aatype.shape[0]): + # Close the previous chain if in a multichain PDB. + if last_chain_index != chain_index[i]: + pdb_lines.append( + _chain_end( + atom_index, + res_1to3(aatype[i - 1]), + chain_ids[chain_index[i - 1]], + residue_index[i - 1], + )) + last_chain_index = chain_index[i] + atom_index += 1 # Atom index increases at the TER symbol. + + res_name_3 = res_1to3(aatype[i]) + for atom_name, pos, mask, b_factor in zip(atom_types, + atom_positions[i], + atom_mask[i], b_factors[i]): + if mask < 0.5: + continue + + record_type = 'ATOM' + name = atom_name if len(atom_name) == 4 else f' {atom_name}' + alt_loc = '' + insertion_code = '' + occupancy = 1.00 + element = atom_name[ + 0] # Protein supports only C, N, O, S, this works. 
+ charge = '' + # PDB is a columnar format, every space matters here! + atom_line = ( + f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}' + f'{res_name_3:>3} {chain_ids[chain_index[i]]:>1}' + f'{residue_index[i]:>4}{insertion_code:>1} ' + f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}' + f'{occupancy:>6.2f}{b_factor:>6.2f} ' + f'{element:>2}{charge:>2}') + pdb_lines.append(atom_line) + atom_index += 1 + + # Close the final chain. + pdb_lines.append( + _chain_end( + atom_index, + res_1to3(aatype[-1]), + chain_ids[chain_index[-1]], + residue_index[-1], + )) + pdb_lines.append('ENDMDL') + pdb_lines.append('END') + + # Pad all lines to 80 characters. + pdb_lines = [line.ljust(80) for line in pdb_lines] + return '\n'.join(pdb_lines) + '\n' # Add terminating newline. + + +def ideal_atom_mask(prot: Protein) -> np.ndarray: + """Computes an ideal atom mask. + + `Protein.atom_mask` typically is defined according to the atoms that are + reported in the PDB. This function computes a mask according to heavy atoms + that should be present in the given sequence of amino acids. + + Args: + prot: `Protein` whose fields are `numpy.ndarray` objects. + + Returns: + An ideal atom mask. + """ + return residue_constants.STANDARD_ATOM_MASK[prot.aatype] + + +def from_prediction(features: FeatureDict, + result: ModelOutput, + b_factors: Optional[np.ndarray] = None) -> Protein: + """Assembles a protein from a prediction. + + Args: + features: Dictionary holding model inputs. + fold_output: Dictionary holding model outputs. + b_factors: (Optional) B-factors to use for the protein. + + Returns: + A protein instance. + """ + + if 'asym_id' in features: + chain_index = features['asym_id'] - 1 + else: + chain_index = np.zeros_like((features['aatype'])) + + if b_factors is None: + b_factors = np.zeros_like(result['final_atom_mask']) + + return Protein( + aatype=features['aatype'], + atom_positions=result['final_atom_positions'], + atom_mask=result['final_atom_mask'], + residue_index=features['residue_index'] + 1, + chain_index=chain_index, + b_factors=b_factors, + ) + + +def from_feature(features: FeatureDict, + b_factors: Optional[np.ndarray] = None) -> Protein: + """Assembles a standard pdb from input atom positions & mask. + + Args: + features: Dictionary holding model inputs. + b_factors: (Optional) B-factors to use for the protein. + + Returns: + A protein instance. + """ + + if 'asym_id' in features: + chain_index = features['asym_id'] - 1 + else: + chain_index = np.zeros_like((features['aatype'])) + + if b_factors is None: + b_factors = np.zeros_like(features['all_atom_mask']) + + return Protein( + aatype=features['aatype'], + atom_positions=features['all_atom_positions'], + atom_mask=features['all_atom_mask'], + residue_index=features['residue_index'] + 1, + chain_index=chain_index, + b_factors=b_factors, + ) diff --git a/modelscope/models/science/unifold/data/residue_constants.py b/modelscope/models/science/unifold/data/residue_constants.py new file mode 100644 index 00000000..beebfe89 --- /dev/null +++ b/modelscope/models/science/unifold/data/residue_constants.py @@ -0,0 +1,1212 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Constants used in AlphaFold.""" + +import collections +import functools +import os +from typing import List, Mapping, Tuple + +import numpy as np +from unicore.utils import tree_map + +# Distance from one CA to next CA [trans configuration: omega = 180]. +ca_ca = 3.80209737096 + +# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in +# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have +# chi angles so their chi angle lists are empty. +chi_angles_atoms = { + 'ALA': [], + # Chi5 in arginine is always 0 +- 5 degrees, so ignore it. + 'ARG': [ + ['N', 'CA', 'CB', 'CG'], + ['CA', 'CB', 'CG', 'CD'], + ['CB', 'CG', 'CD', 'NE'], + ['CG', 'CD', 'NE', 'CZ'], + ], + 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], + 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']], + 'CYS': [['N', 'CA', 'CB', 'SG']], + 'GLN': [ + ['N', 'CA', 'CB', 'CG'], + ['CA', 'CB', 'CG', 'CD'], + ['CB', 'CG', 'CD', 'OE1'], + ], + 'GLU': [ + ['N', 'CA', 'CB', 'CG'], + ['CA', 'CB', 'CG', 'CD'], + ['CB', 'CG', 'CD', 'OE1'], + ], + 'GLY': [], + 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']], + 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']], + 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], + 'LYS': [ + ['N', 'CA', 'CB', 'CG'], + ['CA', 'CB', 'CG', 'CD'], + ['CB', 'CG', 'CD', 'CE'], + ['CG', 'CD', 'CE', 'NZ'], + ], + 'MET': [ + ['N', 'CA', 'CB', 'CG'], + ['CA', 'CB', 'CG', 'SD'], + ['CB', 'CG', 'SD', 'CE'], + ], + 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], + 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']], + 'SER': [['N', 'CA', 'CB', 'OG']], + 'THR': [['N', 'CA', 'CB', 'OG1']], + 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], + 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']], + 'VAL': [['N', 'CA', 'CB', 'CG1']], +} + +# If chi angles given in fixed-length array, this matrix determines how to mask +# them for each AA type. The order is as per restype_order (see below). +chi_angles_mask = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [1.0, 1.0, 1.0, 1.0], # ARG + [1.0, 1.0, 0.0, 0.0], # ASN + [1.0, 1.0, 0.0, 0.0], # ASP + [1.0, 0.0, 0.0, 0.0], # CYS + [1.0, 1.0, 1.0, 0.0], # GLN + [1.0, 1.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [1.0, 1.0, 0.0, 0.0], # HIS + [1.0, 1.0, 0.0, 0.0], # ILE + [1.0, 1.0, 0.0, 0.0], # LEU + [1.0, 1.0, 1.0, 1.0], # LYS + [1.0, 1.0, 1.0, 0.0], # MET + [1.0, 1.0, 0.0, 0.0], # PHE + [1.0, 1.0, 0.0, 0.0], # PRO + [1.0, 0.0, 0.0, 0.0], # SER + [1.0, 0.0, 0.0, 0.0], # THR + [1.0, 1.0, 0.0, 0.0], # TRP + [1.0, 1.0, 0.0, 0.0], # TYR + [1.0, 0.0, 0.0, 0.0], # VAL +] + +# The following chi angles are pi periodic: they can be rotated by a multiple +# of pi without affecting the structure. 
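+# For example, ASP chi2 is flagged below because OD1 and OD2 are symmetric
+# (cf. residue_atom_renaming_swaps), so a 180-degree rotation gives an
+# indistinguishable structure.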
+chi_pi_periodic = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [0.0, 0.0, 0.0, 0.0], # ARG + [0.0, 0.0, 0.0, 0.0], # ASN + [0.0, 1.0, 0.0, 0.0], # ASP + [0.0, 0.0, 0.0, 0.0], # CYS + [0.0, 0.0, 0.0, 0.0], # GLN + [0.0, 0.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [0.0, 0.0, 0.0, 0.0], # HIS + [0.0, 0.0, 0.0, 0.0], # ILE + [0.0, 0.0, 0.0, 0.0], # LEU + [0.0, 0.0, 0.0, 0.0], # LYS + [0.0, 0.0, 0.0, 0.0], # MET + [0.0, 1.0, 0.0, 0.0], # PHE + [0.0, 0.0, 0.0, 0.0], # PRO + [0.0, 0.0, 0.0, 0.0], # SER + [0.0, 0.0, 0.0, 0.0], # THR + [0.0, 0.0, 0.0, 0.0], # TRP + [0.0, 1.0, 0.0, 0.0], # TYR + [0.0, 0.0, 0.0, 0.0], # VAL + [0.0, 0.0, 0.0, 0.0], # UNK +] + +# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi, +# psi and chi angles: +# 0: 'backbone group', +# 1: 'pre-omega-group', (empty) +# 2: 'phi-group', (currently empty, because it defines only hydrogens) +# 3: 'psi-group', +# 4,5,6,7: 'chi1,2,3,4-group' +# The atom positions are relative to the axis-end-atom of the corresponding +# rotation axis. The x-axis is in direction of the rotation axis, and the y-axis +# is defined such that the dihedral-angle-definiting atom (the last entry in +# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate). +# format: [atomname, group_idx, rel_position] +rigid_group_atom_positions = { + 'ALA': [ + ['N', 0, (-0.525, 1.363, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, -0.000, -0.000)], + ['CB', 0, (-0.529, -0.774, -1.205)], + ['O', 3, (0.627, 1.062, 0.000)], + ], + 'ARG': [ + ['N', 0, (-0.524, 1.362, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, -0.000, -0.000)], + ['CB', 0, (-0.524, -0.778, -1.209)], + ['O', 3, (0.626, 1.062, 0.000)], + ['CG', 4, (0.616, 1.390, -0.000)], + ['CD', 5, (0.564, 1.414, 0.000)], + ['NE', 6, (0.539, 1.357, -0.000)], + ['NH1', 7, (0.206, 2.301, 0.000)], + ['NH2', 7, (2.078, 0.978, -0.000)], + ['CZ', 7, (0.758, 1.093, -0.000)], + ], + 'ASN': [ + ['N', 0, (-0.536, 1.357, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, -0.000, -0.000)], + ['CB', 0, (-0.531, -0.787, -1.200)], + ['O', 3, (0.625, 1.062, 0.000)], + ['CG', 4, (0.584, 1.399, 0.000)], + ['ND2', 5, (0.593, -1.188, 0.001)], + ['OD1', 5, (0.633, 1.059, 0.000)], + ], + 'ASP': [ + ['N', 0, (-0.525, 1.362, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.527, 0.000, -0.000)], + ['CB', 0, (-0.526, -0.778, -1.208)], + ['O', 3, (0.626, 1.062, -0.000)], + ['CG', 4, (0.593, 1.398, -0.000)], + ['OD1', 5, (0.610, 1.091, 0.000)], + ['OD2', 5, (0.592, -1.101, -0.003)], + ], + 'CYS': [ + ['N', 0, (-0.522, 1.362, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.524, 0.000, 0.000)], + ['CB', 0, (-0.519, -0.773, -1.212)], + ['O', 3, (0.625, 1.062, -0.000)], + ['SG', 4, (0.728, 1.653, 0.000)], + ], + 'GLN': [ + ['N', 0, (-0.526, 1.361, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, 0.000, 0.000)], + ['CB', 0, (-0.525, -0.779, -1.207)], + ['O', 3, (0.626, 1.062, -0.000)], + ['CG', 4, (0.615, 1.393, 0.000)], + ['CD', 5, (0.587, 1.399, -0.000)], + ['NE2', 6, (0.593, -1.189, -0.001)], + ['OE1', 6, (0.634, 1.060, 0.000)], + ], + 'GLU': [ + ['N', 0, (-0.528, 1.361, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, -0.000, -0.000)], + ['CB', 0, (-0.526, -0.781, -1.207)], + ['O', 3, (0.626, 1.062, 0.000)], + ['CG', 4, (0.615, 1.392, 0.000)], + ['CD', 5, (0.600, 1.397, 0.000)], + ['OE1', 6, (0.607, 1.095, -0.000)], + ['OE2', 6, (0.589, -1.104, -0.001)], + ], + 'GLY': [ + ['N', 0, (-0.572, 1.337, 
0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.517, -0.000, -0.000)], + ['O', 3, (0.626, 1.062, -0.000)], + ], + 'HIS': [ + ['N', 0, (-0.527, 1.360, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, 0.000, 0.000)], + ['CB', 0, (-0.525, -0.778, -1.208)], + ['O', 3, (0.625, 1.063, 0.000)], + ['CG', 4, (0.600, 1.370, -0.000)], + ['CD2', 5, (0.889, -1.021, 0.003)], + ['ND1', 5, (0.744, 1.160, -0.000)], + ['CE1', 5, (2.030, 0.851, 0.002)], + ['NE2', 5, (2.145, -0.466, 0.004)], + ], + 'ILE': [ + ['N', 0, (-0.493, 1.373, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.527, -0.000, -0.000)], + ['CB', 0, (-0.536, -0.793, -1.213)], + ['O', 3, (0.627, 1.062, -0.000)], + ['CG1', 4, (0.534, 1.437, -0.000)], + ['CG2', 4, (0.540, -0.785, -1.199)], + ['CD1', 5, (0.619, 1.391, 0.000)], + ], + 'LEU': [ + ['N', 0, (-0.520, 1.363, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, -0.000, -0.000)], + ['CB', 0, (-0.522, -0.773, -1.214)], + ['O', 3, (0.625, 1.063, -0.000)], + ['CG', 4, (0.678, 1.371, 0.000)], + ['CD1', 5, (0.530, 1.430, -0.000)], + ['CD2', 5, (0.535, -0.774, 1.200)], + ], + 'LYS': [ + ['N', 0, (-0.526, 1.362, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, 0.000, 0.000)], + ['CB', 0, (-0.524, -0.778, -1.208)], + ['O', 3, (0.626, 1.062, -0.000)], + ['CG', 4, (0.619, 1.390, 0.000)], + ['CD', 5, (0.559, 1.417, 0.000)], + ['CE', 6, (0.560, 1.416, 0.000)], + ['NZ', 7, (0.554, 1.387, 0.000)], + ], + 'MET': [ + ['N', 0, (-0.521, 1.364, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, 0.000, 0.000)], + ['CB', 0, (-0.523, -0.776, -1.210)], + ['O', 3, (0.625, 1.062, -0.000)], + ['CG', 4, (0.613, 1.391, -0.000)], + ['SD', 5, (0.703, 1.695, 0.000)], + ['CE', 6, (0.320, 1.786, -0.000)], + ], + 'PHE': [ + ['N', 0, (-0.518, 1.363, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.524, 0.000, -0.000)], + ['CB', 0, (-0.525, -0.776, -1.212)], + ['O', 3, (0.626, 1.062, -0.000)], + ['CG', 4, (0.607, 1.377, 0.000)], + ['CD1', 5, (0.709, 1.195, -0.000)], + ['CD2', 5, (0.706, -1.196, 0.000)], + ['CE1', 5, (2.102, 1.198, -0.000)], + ['CE2', 5, (2.098, -1.201, -0.000)], + ['CZ', 5, (2.794, -0.003, -0.001)], + ], + 'PRO': [ + ['N', 0, (-0.566, 1.351, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.527, -0.000, 0.000)], + ['CB', 0, (-0.546, -0.611, -1.293)], + ['O', 3, (0.621, 1.066, 0.000)], + ['CG', 4, (0.382, 1.445, 0.0)], + # ['CD', 5, (0.427, 1.440, 0.0)], + ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger + ], + 'SER': [ + ['N', 0, (-0.529, 1.360, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, -0.000, -0.000)], + ['CB', 0, (-0.518, -0.777, -1.211)], + ['O', 3, (0.626, 1.062, -0.000)], + ['OG', 4, (0.503, 1.325, 0.000)], + ], + 'THR': [ + ['N', 0, (-0.517, 1.364, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.526, 0.000, -0.000)], + ['CB', 0, (-0.516, -0.793, -1.215)], + ['O', 3, (0.626, 1.062, 0.000)], + ['CG2', 4, (0.550, -0.718, -1.228)], + ['OG1', 4, (0.472, 1.353, 0.000)], + ], + 'TRP': [ + ['N', 0, (-0.521, 1.363, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.525, -0.000, 0.000)], + ['CB', 0, (-0.523, -0.776, -1.212)], + ['O', 3, (0.627, 1.062, 0.000)], + ['CG', 4, (0.609, 1.370, -0.000)], + ['CD1', 5, (0.824, 1.091, 0.000)], + ['CD2', 5, (0.854, -1.148, -0.005)], + ['CE2', 5, (2.186, -0.678, -0.007)], + ['CE3', 5, (0.622, -2.530, -0.007)], + ['NE1', 5, (2.140, 0.690, -0.004)], + ['CH2', 5, (3.028, -2.890, -0.013)], + ['CZ2', 5, (3.283, 
-1.543, -0.011)], + ['CZ3', 5, (1.715, -3.389, -0.011)], + ], + 'TYR': [ + ['N', 0, (-0.522, 1.362, 0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.524, -0.000, -0.000)], + ['CB', 0, (-0.522, -0.776, -1.213)], + ['O', 3, (0.627, 1.062, -0.000)], + ['CG', 4, (0.607, 1.382, -0.000)], + ['CD1', 5, (0.716, 1.195, -0.000)], + ['CD2', 5, (0.713, -1.194, -0.001)], + ['CE1', 5, (2.107, 1.200, -0.002)], + ['CE2', 5, (2.104, -1.201, -0.003)], + ['OH', 5, (4.168, -0.002, -0.005)], + ['CZ', 5, (2.791, -0.001, -0.003)], + ], + 'VAL': [ + ['N', 0, (-0.494, 1.373, -0.000)], + ['CA', 0, (0.000, 0.000, 0.000)], + ['C', 0, (1.527, -0.000, -0.000)], + ['CB', 0, (-0.533, -0.795, -1.213)], + ['O', 3, (0.627, 1.062, -0.000)], + ['CG1', 4, (0.540, 1.429, -0.000)], + ['CG2', 4, (0.533, -0.776, 1.203)], + ], +} + +# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention. +residue_atoms = { + 'ALA': ['C', 'CA', 'CB', 'N', 'O'], + 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'], + 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'], + 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'], + 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'], + 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'], + 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'], + 'GLY': ['C', 'CA', 'N', 'O'], + 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'], + 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'], + 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'], + 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'], + 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'], + 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'], + 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'], + 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'], + 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'], + 'TRP': [ + 'C', + 'CA', + 'CB', + 'CG', + 'CD1', + 'CD2', + 'CE2', + 'CE3', + 'CZ2', + 'CZ3', + 'CH2', + 'N', + 'NE1', + 'O', + ], + 'TYR': + ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O', 'OH'], + 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O'], +} + +# Naming swaps for ambiguous atom names. +# Due to symmetries in the amino acids the naming of atoms is ambiguous in +# 4 of the 20 amino acids. +# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities +# in LEU, VAL and ARG can be resolved by using the 3d constellations of +# the 'ambiguous' atoms and their neighbours) +residue_atom_renaming_swaps = { + 'ASP': { + 'OD1': 'OD2' + }, + 'GLU': { + 'OE1': 'OE2' + }, + 'PHE': { + 'CD1': 'CD2', + 'CE1': 'CE2' + }, + 'TYR': { + 'CD1': 'CD2', + 'CE1': 'CE2' + }, +} + +# Van der Waals radii [Angstroem] of the atoms (from Wikipedia) +van_der_waals_radius = { + 'C': 1.7, + 'N': 1.55, + 'O': 1.52, + 'S': 1.8, +} + +Bond = collections.namedtuple('Bond', + ['atom1_name', 'atom2_name', 'length', 'stddev']) +BondAngle = collections.namedtuple( + 'BondAngle', + ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev']) + + +@functools.lru_cache(maxsize=None) +# def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]], Mapping[ #noqa +# str, List[Bond]], Mapping[str, List[BondAngle]]]: +def load_stereo_chemical_props(): + """Load stereo_chemical_props.txt into a nice structure. + + Load literature values for bond lengths and bond angles and translate + bond angles into the length of the opposite edge of the triangle + ("residue_virtual_bonds"). 
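+    The virtual-bond length follows from the law of cosines applied to the two
+    bond lengths and the enclosed bond angle (see the body of this function).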
+ + Returns: + residue_bonds: Dict that maps resname -> list of Bond tuples. + residue_virtual_bonds: Dict that maps resname -> list of Bond tuples. + residue_bond_angles: Dict that maps resname -> list of BondAngle tuples. + """ + stereo_chemical_props_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'stereo_chemical_props.txt') + with open(stereo_chemical_props_path, 'rt') as f: + stereo_chemical_props = f.read() + lines_iter = iter(stereo_chemical_props.splitlines()) + # Load bond lengths. + residue_bonds = {} + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == '-': + break + bond, resname, length, stddev = line.split() + atom1, atom2 = bond.split('-') + if resname not in residue_bonds: + residue_bonds[resname] = [] + residue_bonds[resname].append( + Bond(atom1, atom2, float(length), float(stddev))) + residue_bonds['UNK'] = [] + + # Load bond angles. + residue_bond_angles = {} + next(lines_iter) # Skip empty line. + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == '-': + break + bond, resname, angle_degree, stddev_degree = line.split() + atom1, atom2, atom3 = bond.split('-') + if resname not in residue_bond_angles: + residue_bond_angles[resname] = [] + residue_bond_angles[resname].append( + BondAngle( + atom1, + atom2, + atom3, + float(angle_degree) / 180.0 * np.pi, + float(stddev_degree) / 180.0 * np.pi, + )) + residue_bond_angles['UNK'] = [] + + def make_bond_key(atom1_name, atom2_name): + """Unique key to lookup bonds.""" + return '-'.join(sorted([atom1_name, atom2_name])) + + # Translate bond angles into distances ("virtual bonds"). + residue_virtual_bonds = {} + for resname, bond_angles in residue_bond_angles.items(): + # Create a fast lookup dict for bond lengths. + bond_cache = {} + for b in residue_bonds[resname]: + bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b + residue_virtual_bonds[resname] = [] + for ba in bond_angles: + bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)] + bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)] + + # Compute distance between atom1 and atom3 using the law of cosines + # c^2 = a^2 + b^2 - 2ab*cos(gamma). + gamma = ba.angle_rad + length = np.sqrt(bond1.length**2 + bond2.length**2 + - 2 * bond1.length * bond2.length * np.cos(gamma)) + + # Propagation of uncertainty assuming uncorrelated errors. + dl_outer = 0.5 / length + dl_dgamma = (2 * bond1.length * bond2.length + * np.sin(gamma)) * dl_outer + dl_db1 = (2 * bond1.length + - 2 * bond2.length * np.cos(gamma)) * dl_outer + dl_db2 = (2 * bond2.length + - 2 * bond1.length * np.cos(gamma)) * dl_outer + stddev = np.sqrt((dl_dgamma * ba.stddev)**2 + + (dl_db1 * bond1.stddev)**2 + + (dl_db2 * bond2.stddev)**2) + residue_virtual_bonds[resname].append( + Bond(ba.atom1_name, ba.atom3name, length, stddev)) + + return (residue_bonds, residue_virtual_bonds, residue_bond_angles) + + +# Between-residue bond lengths for general bonds (first element) and for Proline +# (second element). +between_res_bond_length_c_n = [1.329, 1.341] +between_res_bond_length_stddev_c_n = [0.014, 0.016] + +# Between-residue cos_angles. +between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315 +between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995 + +# This mapping is used when we need to store atom data in a format that requires +# fixed atom data size for every residue (e.g. a numpy array). 
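+# This fixed 37-entry layout ('atom37') is the atom dimension used by features
+# such as all_atom_positions and template_all_atom_positions elsewhere in this
+# package.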
+atom_types = [ + 'N', + 'CA', + 'C', + 'CB', + 'O', + 'CG', + 'CG1', + 'CG2', + 'OG', + 'OG1', + 'SG', + 'CD', + 'CD1', + 'CD2', + 'ND1', + 'ND2', + 'OD1', + 'OD2', + 'SD', + 'CE', + 'CE1', + 'CE2', + 'CE3', + 'NE', + 'NE1', + 'NE2', + 'OE1', + 'OE2', + 'CH2', + 'NH1', + 'NH2', + 'OH', + 'CZ', + 'CZ2', + 'CZ3', + 'NZ', + 'OXT', +] +atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)} +atom_type_num = len(atom_types) # := 37. + +# A compact atom encoding with 14 columns +# pylint: disable=line-too-long +# pylint: disable=bad-whitespace +restype_name_to_atom14_names = { + 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''], + 'ARG': [ + 'N', + 'CA', + 'C', + 'O', + 'CB', + 'CG', + 'CD', + 'NE', + 'CZ', + 'NH1', + 'NH2', + '', + '', + '', + ], + 'ASN': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''], + 'ASP': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''], + 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''], + 'GLN': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''], + 'GLU': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''], + 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''], + 'HIS': [ + 'N', + 'CA', + 'C', + 'O', + 'CB', + 'CG', + 'ND1', + 'CD2', + 'CE1', + 'NE2', + '', + '', + '', + '', + ], + 'ILE': + ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''], + 'LEU': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''], + 'LYS': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''], + 'MET': + ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''], + 'PHE': [ + 'N', + 'CA', + 'C', + 'O', + 'CB', + 'CG', + 'CD1', + 'CD2', + 'CE1', + 'CE2', + 'CZ', + '', + '', + '', + ], + 'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', ''], + 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''], + 'THR': + ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''], + 'TRP': [ + 'N', + 'CA', + 'C', + 'O', + 'CB', + 'CG', + 'CD1', + 'CD2', + 'NE1', + 'CE2', + 'CE3', + 'CZ2', + 'CZ3', + 'CH2', + ], + 'TYR': [ + 'N', + 'CA', + 'C', + 'O', + 'CB', + 'CG', + 'CD1', + 'CD2', + 'CE1', + 'CE2', + 'CZ', + 'OH', + '', + '', + ], + 'VAL': + ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''], + 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''], +} +# pylint: enable=line-too-long +# pylint: enable=bad-whitespace + +# This is the standard residue order when coding AA type as a number. +# Reproduce it by taking 3-letter AA codes and sorting them alphabetically. +restypes = [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'Q', + 'E', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V', +] +restype_order = {restype: i for i, restype in enumerate(restypes)} +restype_num = len(restypes) # := 20. +unk_restype_index = restype_num # Catch-all index for unknown restypes. + +restypes_with_x = restypes + ['X'] +restype_order_with_x = { + restype: i + for i, restype in enumerate(restypes_with_x) +} + + +def sequence_to_onehot(sequence: str, + mapping: Mapping[str, int], + map_unknown_to_x: bool = False) -> np.ndarray: + """Maps the given sequence into a one-hot encoded matrix. + + Args: + sequence: An amino acid sequence. + mapping: A dictionary mapping amino acids to integers. 
+ map_unknown_to_x: If True, any amino acid that is not in the mapping will be + mapped to the unknown amino acid 'X'. If the mapping doesn't contain + amino acid 'X', an error will be thrown. If False, any amino acid not in + the mapping will throw an error. + + Returns: + A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of + the sequence. + + Raises: + ValueError: If the mapping doesn't contain values from 0 to + num_unique_aas - 1 without any gaps. + """ + num_entries = max(mapping.values()) + 1 + + if sorted(set(mapping.values())) != list(range(num_entries)): + raise ValueError( + 'The mapping must have values from 0 to num_unique_aas-1 ' + 'without any gaps. Got: %s' % sorted(mapping.values())) + + one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32) + + for aa_index, aa_type in enumerate(sequence): + if map_unknown_to_x: + if aa_type.isalpha() and aa_type.isupper(): + aa_id = mapping.get(aa_type, mapping['X']) + else: + raise ValueError( + f'Invalid character in the sequence: {aa_type}') + else: + aa_id = mapping[aa_type] + one_hot_arr[aa_index, aa_id] = 1 + + return one_hot_arr + + +restype_1to3 = { + 'A': 'ALA', + 'R': 'ARG', + 'N': 'ASN', + 'D': 'ASP', + 'C': 'CYS', + 'Q': 'GLN', + 'E': 'GLU', + 'G': 'GLY', + 'H': 'HIS', + 'I': 'ILE', + 'L': 'LEU', + 'K': 'LYS', + 'M': 'MET', + 'F': 'PHE', + 'P': 'PRO', + 'S': 'SER', + 'T': 'THR', + 'W': 'TRP', + 'Y': 'TYR', + 'V': 'VAL', +} + +# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple +# 1-to-1 mapping of 3 letter names to one letter names. The latter contains +# many more, and less common, three letter names as keys and maps many of these +# to the same one letter name (including 'X' and 'U' which we don't use here). +restype_3to1 = {v: k for k, v in restype_1to3.items()} + +# Define a restype name for all unknown residues. +unk_restype = 'UNK' + +resnames = [restype_1to3[r] for r in restypes] + [unk_restype] +resname_to_idx = {resname: i for i, resname in enumerate(resnames)} + +# The mapping here uses hhblits convention, so that B is mapped to D, J and O +# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the +# remaining 20 amino acids are kept in alphabetical order. +# There are 2 non-amino acid codes, X (representing any amino acid) and +# "-" representing a missing amino acid in an alignment. The id for these +# codes is put at the end (20 and 21) so that they can easily be ignored if +# desired. +HHBLITS_AA_TO_ID = { + 'A': 0, + 'B': 2, + 'C': 1, + 'D': 2, + 'E': 3, + 'F': 4, + 'G': 5, + 'H': 6, + 'I': 7, + 'J': 20, + 'K': 8, + 'L': 9, + 'M': 10, + 'N': 11, + 'O': 20, + 'P': 12, + 'Q': 13, + 'R': 14, + 'S': 15, + 'T': 16, + 'U': 1, + 'V': 17, + 'W': 18, + 'X': 20, + 'Y': 19, + 'Z': 3, + '-': 21, +} + +# Partial inversion of HHBLITS_AA_TO_ID. +ID_TO_HHBLITS_AA = { + 0: 'A', + 1: 'C', # Also U. + 2: 'D', # Also B. + 3: 'E', # Also Z. + 4: 'F', + 5: 'G', + 6: 'H', + 7: 'I', + 8: 'K', + 9: 'L', + 10: 'M', + 11: 'N', + 12: 'P', + 13: 'Q', + 14: 'R', + 15: 'S', + 16: 'T', + 17: 'V', + 18: 'W', + 19: 'Y', + 20: 'X', # Includes J and O. + 21: '-', +} + +restypes_with_x_and_gap = restypes + ['X', '-'] +MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple( + restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) + for i in range(len(restypes_with_x_and_gap))) + + +def _make_standard_atom_mask() -> np.ndarray: + """Returns [num_res_types, num_atom_types] mask array.""" + # +1 to account for unknown (all 0s). 
+ mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32) + for restype, restype_letter in enumerate(restypes): + restype_name = restype_1to3[restype_letter] + atom_names = residue_atoms[restype_name] + for atom_name in atom_names: + atom_type = atom_order[atom_name] + mask[restype, atom_type] = 1 + return mask + + +STANDARD_ATOM_MASK = _make_standard_atom_mask() + + +# A one hot representation for the first and second atoms defining the axis +# of rotation for each chi-angle in each residue. +def chi_angle_atom(atom_index: int) -> np.ndarray: + """Define chi-angle rigid groups via one-hot representations.""" + chi_angles_index = {} + one_hots = [] + + for k, v in chi_angles_atoms.items(): + indices = [atom_types.index(s[atom_index]) for s in v] + indices.extend([-1] * (4 - len(indices))) + chi_angles_index[k] = indices + + for r in restypes: + res3 = restype_1to3[r] + one_hot = np.eye(atom_type_num)[chi_angles_index[res3]] + one_hots.append(one_hot) + + one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`. + one_hot = np.stack(one_hots, axis=0) + one_hot = np.transpose(one_hot, [0, 2, 1]) + + return one_hot + + +chi_atom_1_one_hot = chi_angle_atom(1) +chi_atom_2_one_hot = chi_angle_atom(2) + +# An array like chi_angles_atoms but using indices rather than names. +chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes] +chi_angles_atom_indices = tree_map( + lambda n: atom_order[n], chi_angles_atom_indices, leaf_type=str) +chi_angles_atom_indices = np.array([ + chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) + for chi_atoms in chi_angles_atom_indices +]) + +# Mapping from (res_name, atom_name) pairs to the atom's chi group index +# and atom index within that group. +chi_groups_for_atom = collections.defaultdict(list) +for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items(): + for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res): + for atom_i, atom in enumerate(chi_group): + chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i)) +chi_groups_for_atom = dict(chi_groups_for_atom) + + +def _make_rigid_transformation_4x4(ex, ey, translation): + """Create a rigid 4x4 transformation matrix from two axes and transl.""" + # Normalize ex. 
+ ex_normalized = ex / np.linalg.norm(ex) + + # make ey perpendicular to ex + ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized + ey_normalized /= np.linalg.norm(ey_normalized) + + # compute ez as cross product + eznorm = np.cross(ex_normalized, ey_normalized) + m = np.stack([ex_normalized, ey_normalized, eznorm, + translation]).transpose() + m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0) + return m + + +# create an array with (restype, atomtype) --> rigid_group_idx +# and an array with (restype, atomtype, coord) for the atom positions +# and compute affine transformation matrices (4,4) from one rigid group to the +# previous group +restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=np.int_) +restype_atom37_mask = np.zeros([21, 37], dtype=np.float32) +restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32) +restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=np.int_) +restype_atom14_mask = np.zeros([21, 14], dtype=np.float32) +restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32) +restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32) + + +def _make_rigid_group_constants(): + """Fill the arrays above.""" + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + for atomname, group_idx, atom_position in rigid_group_atom_positions[ + resname]: + atomtype = atom_order[atomname] + restype_atom37_to_rigid_group[restype, atomtype] = group_idx + restype_atom37_mask[restype, atomtype] = 1 + restype_atom37_rigid_group_positions[restype, + atomtype, :] = atom_position + + atom14idx = restype_name_to_atom14_names[resname].index(atomname) + restype_atom14_to_rigid_group[restype, atom14idx] = group_idx + restype_atom14_mask[restype, atom14idx] = 1 + restype_atom14_rigid_group_positions[restype, + atom14idx, :] = atom_position + + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_positions = { + name: np.array(pos) + for name, _, pos in rigid_group_atom_positions[resname] + } + + # backbone to backbone is the identity transform + restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4) + + # pre-omega-frame to backbone (currently dummy identity matrix) + restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4) + + # phi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions['N'] - atom_positions['CA'], + ey=np.array([1.0, 0.0, 0.0]), + translation=atom_positions['N'], + ) + restype_rigid_group_default_frame[restype, 2, :, :] = mat + + # psi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions['C'] - atom_positions['CA'], + ey=atom_positions['CA'] - atom_positions['N'], + translation=atom_positions['C'], + ) + restype_rigid_group_default_frame[restype, 3, :, :] = mat + + # chi1-frame to backbone + if chi_angles_mask[restype][0]: + base_atom_names = chi_angles_atoms[resname][0] + base_atom_positions = [ + atom_positions[name] for name in base_atom_names + ] + mat = _make_rigid_transformation_4x4( + ex=base_atom_positions[2] - base_atom_positions[1], + ey=base_atom_positions[0] - base_atom_positions[1], + translation=base_atom_positions[2], + ) + restype_rigid_group_default_frame[restype, 4, :, :] = mat + + # chi2-frame to chi1-frame + # chi3-frame to chi2-frame + # chi4-frame to chi3-frame + # luckily all rotation axes for the next frame start at (0,0,0) of the + # previous frame + for chi_idx in range(1, 4): + if chi_angles_mask[restype][chi_idx]: + 
axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2] + axis_end_atom_position = atom_positions[axis_end_atom_name] + mat = _make_rigid_transformation_4x4( + ex=axis_end_atom_position, + ey=np.array([-1.0, 0.0, 0.0]), + translation=axis_end_atom_position, + ) + restype_rigid_group_default_frame[restype, + 4 + chi_idx, :, :] = mat + + +_make_rigid_group_constants() + + +def make_atom14_dists_bounds(overlap_tolerance=1.5, + bond_length_tolerance_factor=15): + """compute upper and lower bounds for bonds to assess violations.""" + restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32) + residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props() + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_list = restype_name_to_atom14_names[resname] + + # create lower and upper bounds for clashes + for atom1_idx, atom1_name in enumerate(atom_list): + if not atom1_name: + continue + atom1_radius = van_der_waals_radius[atom1_name[0]] + for atom2_idx, atom2_name in enumerate(atom_list): + if (not atom2_name) or atom1_idx == atom2_idx: + continue + atom2_radius = van_der_waals_radius[atom2_name[0]] + lower = atom1_radius + atom2_radius - overlap_tolerance + upper = 1e10 + restype_atom14_bond_lower_bound[restype, atom1_idx, + atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, + atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, + atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, + atom1_idx] = upper + + # overwrite lower and upper bounds for bonds and angles + for b in residue_bonds[resname] + residue_virtual_bonds[resname]: + atom1_idx = atom_list.index(b.atom1_name) + atom2_idx = atom_list.index(b.atom2_name) + lower = b.length - bond_length_tolerance_factor * b.stddev + upper = b.length + bond_length_tolerance_factor * b.stddev + restype_atom14_bond_lower_bound[restype, atom1_idx, + atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, + atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, + atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, + atom1_idx] = upper + restype_atom14_bond_stddev[restype, atom1_idx, + atom2_idx] = b.stddev + restype_atom14_bond_stddev[restype, atom2_idx, + atom1_idx] = b.stddev + return { + 'lower_bound': restype_atom14_bond_lower_bound, # shape (21,14,14) + 'upper_bound': restype_atom14_bond_upper_bound, # shape (21,14,14) + 'stddev': restype_atom14_bond_stddev, # shape (21,14,14) + } + + +def _make_atom14_and_atom37_constants(): + restype_atom14_to_atom37 = [] + restype_atom37_to_atom14 = [] + restype_atom14_mask = [] + + for rt in restypes: + atom_names = restype_name_to_atom14_names[restype_1to3[rt]] + restype_atom14_to_atom37.append([(atom_order[name] if name else 0) + for name in atom_names]) + atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)} + restype_atom37_to_atom14.append([ + (atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0) + for name in atom_types + ]) + + restype_atom14_mask.append([(1.0 if name else 0.0) + for name in atom_names]) + + # Add dummy mapping for restype 'UNK' + restype_atom14_to_atom37.append([0] * 14) + restype_atom37_to_atom14.append([0] * 37) + restype_atom14_mask.append([0.0] * 14) + + restype_atom14_to_atom37 = np.array( + restype_atom14_to_atom37, dtype=np.int32) + 
restype_atom37_to_atom14 = np.array( + restype_atom37_to_atom14, dtype=np.int32) + restype_atom14_mask = np.array(restype_atom14_mask, dtype=np.float32) + + return restype_atom14_to_atom37, restype_atom37_to_atom14, restype_atom14_mask + + +( + restype_atom14_to_atom37, + restype_atom37_to_atom14, + restype_atom14_mask, +) = _make_atom14_and_atom37_constants() + + +def _make_renaming_matrices(): + # As the atom naming is ambiguous for 7 of the 20 amino acids, provide + # alternative ground truth coordinates where the naming is swapped + restype_3 = [restype_1to3[res] for res in restypes] + restype_3 += ['UNK'] + + # Matrices for renaming ambiguous atoms. + all_matrices = {res: np.eye(14) for res in restype_3} + for resname, swap in residue_atom_renaming_swaps.items(): + correspondences = np.arange(14) + for source_atom_swap, target_atom_swap in swap.items(): + source_index = restype_name_to_atom14_names[resname].index( + source_atom_swap) + target_index = restype_name_to_atom14_names[resname].index( + target_atom_swap) + correspondences[source_index] = target_index + correspondences[target_index] = source_index + renaming_matrix = np.zeros((14, 14)) + for index, correspondence in enumerate(correspondences): + renaming_matrix[index, correspondence] = 1.0 + all_matrices[resname] = renaming_matrix + renaming_matrices = np.stack( + [all_matrices[restype] for restype in restype_3]) + return renaming_matrices + + +renaming_matrices = _make_renaming_matrices() + + +def _make_atom14_is_ambiguous(): + # Create an ambiguous atoms mask. shape: (21, 14). + restype_atom14_is_ambiguous = np.zeros((21, 14)) + for resname, swap in residue_atom_renaming_swaps.items(): + for atom_name1, atom_name2 in swap.items(): + restype = restype_order[restype_3to1[resname]] + atom_idx1 = restype_name_to_atom14_names[resname].index(atom_name1) + atom_idx2 = restype_name_to_atom14_names[resname].index(atom_name2) + restype_atom14_is_ambiguous[restype, atom_idx1] = 1 + restype_atom14_is_ambiguous[restype, atom_idx2] = 1 + return restype_atom14_is_ambiguous + + +restype_atom14_is_ambiguous = _make_atom14_is_ambiguous() + + +def get_chi_atom_indices(): + """Returns atom indices needed to compute chi angles for all residue types. + + Returns: + A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are + in the order specified in restypes + unknown residue type + at the end. For chi angles which are not defined on the residue, the + positions indices are by default set to 0. + """ + chi_atom_indices = [] + for residue_name in restypes: + residue_name = restype_1to3[residue_name] + residue_chi_angles = chi_angles_atoms[residue_name] + atom_indices = [] + for chi_angle in residue_chi_angles: + atom_indices.append([atom_order[atom] for atom in chi_angle]) + for _ in range(4 - len(atom_indices)): + atom_indices.append([0, 0, 0, + 0]) # For chi angles not defined on the AA. + chi_atom_indices.append(atom_indices) + + chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue. 
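+    # The result is a nested [21, 4, 4] list of atom37 indices; e.g., assuming the
+    # standard chi_angles_atoms table above, chi1 of ARG uses N, CA, CB and CG,
+    # which map to [0, 1, 3, 5] under atom_order.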
+ + return chi_atom_indices + + +chi_atom_indices = get_chi_atom_indices() diff --git a/modelscope/models/science/unifold/data/stereo_chemical_props.txt b/modelscope/models/science/unifold/data/stereo_chemical_props.txt new file mode 100644 index 00000000..25262efd --- /dev/null +++ b/modelscope/models/science/unifold/data/stereo_chemical_props.txt @@ -0,0 +1,345 @@ +Bond Residue Mean StdDev +CA-CB ALA 1.520 0.021 +N-CA ALA 1.459 0.020 +CA-C ALA 1.525 0.026 +C-O ALA 1.229 0.019 +CA-CB ARG 1.535 0.022 +CB-CG ARG 1.521 0.027 +CG-CD ARG 1.515 0.025 +CD-NE ARG 1.460 0.017 +NE-CZ ARG 1.326 0.013 +CZ-NH1 ARG 1.326 0.013 +CZ-NH2 ARG 1.326 0.013 +N-CA ARG 1.459 0.020 +CA-C ARG 1.525 0.026 +C-O ARG 1.229 0.019 +CA-CB ASN 1.527 0.026 +CB-CG ASN 1.506 0.023 +CG-OD1 ASN 1.235 0.022 +CG-ND2 ASN 1.324 0.025 +N-CA ASN 1.459 0.020 +CA-C ASN 1.525 0.026 +C-O ASN 1.229 0.019 +CA-CB ASP 1.535 0.022 +CB-CG ASP 1.513 0.021 +CG-OD1 ASP 1.249 0.023 +CG-OD2 ASP 1.249 0.023 +N-CA ASP 1.459 0.020 +CA-C ASP 1.525 0.026 +C-O ASP 1.229 0.019 +CA-CB CYS 1.526 0.013 +CB-SG CYS 1.812 0.016 +N-CA CYS 1.459 0.020 +CA-C CYS 1.525 0.026 +C-O CYS 1.229 0.019 +CA-CB GLU 1.535 0.022 +CB-CG GLU 1.517 0.019 +CG-CD GLU 1.515 0.015 +CD-OE1 GLU 1.252 0.011 +CD-OE2 GLU 1.252 0.011 +N-CA GLU 1.459 0.020 +CA-C GLU 1.525 0.026 +C-O GLU 1.229 0.019 +CA-CB GLN 1.535 0.022 +CB-CG GLN 1.521 0.027 +CG-CD GLN 1.506 0.023 +CD-OE1 GLN 1.235 0.022 +CD-NE2 GLN 1.324 0.025 +N-CA GLN 1.459 0.020 +CA-C GLN 1.525 0.026 +C-O GLN 1.229 0.019 +N-CA GLY 1.456 0.015 +CA-C GLY 1.514 0.016 +C-O GLY 1.232 0.016 +CA-CB HIS 1.535 0.022 +CB-CG HIS 1.492 0.016 +CG-ND1 HIS 1.369 0.015 +CG-CD2 HIS 1.353 0.017 +ND1-CE1 HIS 1.343 0.025 +CD2-NE2 HIS 1.415 0.021 +CE1-NE2 HIS 1.322 0.023 +N-CA HIS 1.459 0.020 +CA-C HIS 1.525 0.026 +C-O HIS 1.229 0.019 +CA-CB ILE 1.544 0.023 +CB-CG1 ILE 1.536 0.028 +CB-CG2 ILE 1.524 0.031 +CG1-CD1 ILE 1.500 0.069 +N-CA ILE 1.459 0.020 +CA-C ILE 1.525 0.026 +C-O ILE 1.229 0.019 +CA-CB LEU 1.533 0.023 +CB-CG LEU 1.521 0.029 +CG-CD1 LEU 1.514 0.037 +CG-CD2 LEU 1.514 0.037 +N-CA LEU 1.459 0.020 +CA-C LEU 1.525 0.026 +C-O LEU 1.229 0.019 +CA-CB LYS 1.535 0.022 +CB-CG LYS 1.521 0.027 +CG-CD LYS 1.520 0.034 +CD-CE LYS 1.508 0.025 +CE-NZ LYS 1.486 0.025 +N-CA LYS 1.459 0.020 +CA-C LYS 1.525 0.026 +C-O LYS 1.229 0.019 +CA-CB MET 1.535 0.022 +CB-CG MET 1.509 0.032 +CG-SD MET 1.807 0.026 +SD-CE MET 1.774 0.056 +N-CA MET 1.459 0.020 +CA-C MET 1.525 0.026 +C-O MET 1.229 0.019 +CA-CB PHE 1.535 0.022 +CB-CG PHE 1.509 0.017 +CG-CD1 PHE 1.383 0.015 +CG-CD2 PHE 1.383 0.015 +CD1-CE1 PHE 1.388 0.020 +CD2-CE2 PHE 1.388 0.020 +CE1-CZ PHE 1.369 0.019 +CE2-CZ PHE 1.369 0.019 +N-CA PHE 1.459 0.020 +CA-C PHE 1.525 0.026 +C-O PHE 1.229 0.019 +CA-CB PRO 1.531 0.020 +CB-CG PRO 1.495 0.050 +CG-CD PRO 1.502 0.033 +CD-N PRO 1.474 0.014 +N-CA PRO 1.468 0.017 +CA-C PRO 1.524 0.020 +C-O PRO 1.228 0.020 +CA-CB SER 1.525 0.015 +CB-OG SER 1.418 0.013 +N-CA SER 1.459 0.020 +CA-C SER 1.525 0.026 +C-O SER 1.229 0.019 +CA-CB THR 1.529 0.026 +CB-OG1 THR 1.428 0.020 +CB-CG2 THR 1.519 0.033 +N-CA THR 1.459 0.020 +CA-C THR 1.525 0.026 +C-O THR 1.229 0.019 +CA-CB TRP 1.535 0.022 +CB-CG TRP 1.498 0.018 +CG-CD1 TRP 1.363 0.014 +CG-CD2 TRP 1.432 0.017 +CD1-NE1 TRP 1.375 0.017 +NE1-CE2 TRP 1.371 0.013 +CD2-CE2 TRP 1.409 0.012 +CD2-CE3 TRP 1.399 0.015 +CE2-CZ2 TRP 1.393 0.017 +CE3-CZ3 TRP 1.380 0.017 +CZ2-CH2 TRP 1.369 0.019 +CZ3-CH2 TRP 1.396 0.016 +N-CA TRP 1.459 0.020 +CA-C TRP 1.525 0.026 +C-O TRP 1.229 0.019 +CA-CB TYR 1.535 0.022 +CB-CG TYR 1.512 0.015 +CG-CD1 TYR 1.387 0.013 
+CG-CD2 TYR 1.387 0.013 +CD1-CE1 TYR 1.389 0.015 +CD2-CE2 TYR 1.389 0.015 +CE1-CZ TYR 1.381 0.013 +CE2-CZ TYR 1.381 0.013 +CZ-OH TYR 1.374 0.017 +N-CA TYR 1.459 0.020 +CA-C TYR 1.525 0.026 +C-O TYR 1.229 0.019 +CA-CB VAL 1.543 0.021 +CB-CG1 VAL 1.524 0.021 +CB-CG2 VAL 1.524 0.021 +N-CA VAL 1.459 0.020 +CA-C VAL 1.525 0.026 +C-O VAL 1.229 0.019 +- + +Angle Residue Mean StdDev +N-CA-CB ALA 110.1 1.4 +CB-CA-C ALA 110.1 1.5 +N-CA-C ALA 111.0 2.7 +CA-C-O ALA 120.1 2.1 +N-CA-CB ARG 110.6 1.8 +CB-CA-C ARG 110.4 2.0 +CA-CB-CG ARG 113.4 2.2 +CB-CG-CD ARG 111.6 2.6 +CG-CD-NE ARG 111.8 2.1 +CD-NE-CZ ARG 123.6 1.4 +NE-CZ-NH1 ARG 120.3 0.5 +NE-CZ-NH2 ARG 120.3 0.5 +NH1-CZ-NH2 ARG 119.4 1.1 +N-CA-C ARG 111.0 2.7 +CA-C-O ARG 120.1 2.1 +N-CA-CB ASN 110.6 1.8 +CB-CA-C ASN 110.4 2.0 +CA-CB-CG ASN 113.4 2.2 +CB-CG-ND2 ASN 116.7 2.4 +CB-CG-OD1 ASN 121.6 2.0 +ND2-CG-OD1 ASN 121.9 2.3 +N-CA-C ASN 111.0 2.7 +CA-C-O ASN 120.1 2.1 +N-CA-CB ASP 110.6 1.8 +CB-CA-C ASP 110.4 2.0 +CA-CB-CG ASP 113.4 2.2 +CB-CG-OD1 ASP 118.3 0.9 +CB-CG-OD2 ASP 118.3 0.9 +OD1-CG-OD2 ASP 123.3 1.9 +N-CA-C ASP 111.0 2.7 +CA-C-O ASP 120.1 2.1 +N-CA-CB CYS 110.8 1.5 +CB-CA-C CYS 111.5 1.2 +CA-CB-SG CYS 114.2 1.1 +N-CA-C CYS 111.0 2.7 +CA-C-O CYS 120.1 2.1 +N-CA-CB GLU 110.6 1.8 +CB-CA-C GLU 110.4 2.0 +CA-CB-CG GLU 113.4 2.2 +CB-CG-CD GLU 114.2 2.7 +CG-CD-OE1 GLU 118.3 2.0 +CG-CD-OE2 GLU 118.3 2.0 +OE1-CD-OE2 GLU 123.3 1.2 +N-CA-C GLU 111.0 2.7 +CA-C-O GLU 120.1 2.1 +N-CA-CB GLN 110.6 1.8 +CB-CA-C GLN 110.4 2.0 +CA-CB-CG GLN 113.4 2.2 +CB-CG-CD GLN 111.6 2.6 +CG-CD-OE1 GLN 121.6 2.0 +CG-CD-NE2 GLN 116.7 2.4 +OE1-CD-NE2 GLN 121.9 2.3 +N-CA-C GLN 111.0 2.7 +CA-C-O GLN 120.1 2.1 +N-CA-C GLY 113.1 2.5 +CA-C-O GLY 120.6 1.8 +N-CA-CB HIS 110.6 1.8 +CB-CA-C HIS 110.4 2.0 +CA-CB-CG HIS 113.6 1.7 +CB-CG-ND1 HIS 123.2 2.5 +CB-CG-CD2 HIS 130.8 3.1 +CG-ND1-CE1 HIS 108.2 1.4 +ND1-CE1-NE2 HIS 109.9 2.2 +CE1-NE2-CD2 HIS 106.6 2.5 +NE2-CD2-CG HIS 109.2 1.9 +CD2-CG-ND1 HIS 106.0 1.4 +N-CA-C HIS 111.0 2.7 +CA-C-O HIS 120.1 2.1 +N-CA-CB ILE 110.8 2.3 +CB-CA-C ILE 111.6 2.0 +CA-CB-CG1 ILE 111.0 1.9 +CB-CG1-CD1 ILE 113.9 2.8 +CA-CB-CG2 ILE 110.9 2.0 +CG1-CB-CG2 ILE 111.4 2.2 +N-CA-C ILE 111.0 2.7 +CA-C-O ILE 120.1 2.1 +N-CA-CB LEU 110.4 2.0 +CB-CA-C LEU 110.2 1.9 +CA-CB-CG LEU 115.3 2.3 +CB-CG-CD1 LEU 111.0 1.7 +CB-CG-CD2 LEU 111.0 1.7 +CD1-CG-CD2 LEU 110.5 3.0 +N-CA-C LEU 111.0 2.7 +CA-C-O LEU 120.1 2.1 +N-CA-CB LYS 110.6 1.8 +CB-CA-C LYS 110.4 2.0 +CA-CB-CG LYS 113.4 2.2 +CB-CG-CD LYS 111.6 2.6 +CG-CD-CE LYS 111.9 3.0 +CD-CE-NZ LYS 111.7 2.3 +N-CA-C LYS 111.0 2.7 +CA-C-O LYS 120.1 2.1 +N-CA-CB MET 110.6 1.8 +CB-CA-C MET 110.4 2.0 +CA-CB-CG MET 113.3 1.7 +CB-CG-SD MET 112.4 3.0 +CG-SD-CE MET 100.2 1.6 +N-CA-C MET 111.0 2.7 +CA-C-O MET 120.1 2.1 +N-CA-CB PHE 110.6 1.8 +CB-CA-C PHE 110.4 2.0 +CA-CB-CG PHE 113.9 2.4 +CB-CG-CD1 PHE 120.8 0.7 +CB-CG-CD2 PHE 120.8 0.7 +CD1-CG-CD2 PHE 118.3 1.3 +CG-CD1-CE1 PHE 120.8 1.1 +CG-CD2-CE2 PHE 120.8 1.1 +CD1-CE1-CZ PHE 120.1 1.2 +CD2-CE2-CZ PHE 120.1 1.2 +CE1-CZ-CE2 PHE 120.0 1.8 +N-CA-C PHE 111.0 2.7 +CA-C-O PHE 120.1 2.1 +N-CA-CB PRO 103.3 1.2 +CB-CA-C PRO 111.7 2.1 +CA-CB-CG PRO 104.8 1.9 +CB-CG-CD PRO 106.5 3.9 +CG-CD-N PRO 103.2 1.5 +CA-N-CD PRO 111.7 1.4 +N-CA-C PRO 112.1 2.6 +CA-C-O PRO 120.2 2.4 +N-CA-CB SER 110.5 1.5 +CB-CA-C SER 110.1 1.9 +CA-CB-OG SER 111.2 2.7 +N-CA-C SER 111.0 2.7 +CA-C-O SER 120.1 2.1 +N-CA-CB THR 110.3 1.9 +CB-CA-C THR 111.6 2.7 +CA-CB-OG1 THR 109.0 2.1 +CA-CB-CG2 THR 112.4 1.4 +OG1-CB-CG2 THR 110.0 2.3 +N-CA-C THR 111.0 2.7 +CA-C-O THR 120.1 2.1 +N-CA-CB TRP 110.6 1.8 +CB-CA-C TRP 110.4 2.0 
+CA-CB-CG TRP 113.7 1.9 +CB-CG-CD1 TRP 127.0 1.3 +CB-CG-CD2 TRP 126.6 1.3 +CD1-CG-CD2 TRP 106.3 0.8 +CG-CD1-NE1 TRP 110.1 1.0 +CD1-NE1-CE2 TRP 109.0 0.9 +NE1-CE2-CD2 TRP 107.3 1.0 +CE2-CD2-CG TRP 107.3 0.8 +CG-CD2-CE3 TRP 133.9 0.9 +NE1-CE2-CZ2 TRP 130.4 1.1 +CE3-CD2-CE2 TRP 118.7 1.2 +CD2-CE2-CZ2 TRP 122.3 1.2 +CE2-CZ2-CH2 TRP 117.4 1.0 +CZ2-CH2-CZ3 TRP 121.6 1.2 +CH2-CZ3-CE3 TRP 121.2 1.1 +CZ3-CE3-CD2 TRP 118.8 1.3 +N-CA-C TRP 111.0 2.7 +CA-C-O TRP 120.1 2.1 +N-CA-CB TYR 110.6 1.8 +CB-CA-C TYR 110.4 2.0 +CA-CB-CG TYR 113.4 1.9 +CB-CG-CD1 TYR 121.0 0.6 +CB-CG-CD2 TYR 121.0 0.6 +CD1-CG-CD2 TYR 117.9 1.1 +CG-CD1-CE1 TYR 121.3 0.8 +CG-CD2-CE2 TYR 121.3 0.8 +CD1-CE1-CZ TYR 119.8 0.9 +CD2-CE2-CZ TYR 119.8 0.9 +CE1-CZ-CE2 TYR 119.8 1.6 +CE1-CZ-OH TYR 120.1 2.7 +CE2-CZ-OH TYR 120.1 2.7 +N-CA-C TYR 111.0 2.7 +CA-C-O TYR 120.1 2.1 +N-CA-CB VAL 111.5 2.2 +CB-CA-C VAL 111.4 1.9 +CA-CB-CG1 VAL 110.9 1.5 +CA-CB-CG2 VAL 110.9 1.5 +CG1-CB-CG2 VAL 110.9 1.6 +N-CA-C VAL 111.0 2.7 +CA-C-O VAL 120.1 2.1 +- + +Non-bonded distance Minimum Dist Tolerance +C-C 3.4 1.5 +C-N 3.25 1.5 +C-S 3.5 1.5 +C-O 3.22 1.5 +N-N 3.1 1.5 +N-S 3.35 1.5 +N-O 3.07 1.5 +O-S 3.32 1.5 +O-O 3.04 1.5 +S-S 2.03 1.0 +- diff --git a/modelscope/models/science/unifold/data/utils.py b/modelscope/models/science/unifold/data/utils.py new file mode 100644 index 00000000..2be91ef0 --- /dev/null +++ b/modelscope/models/science/unifold/data/utils.py @@ -0,0 +1,161 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import copy as copy_lib +import functools +import gzip +import pickle +from typing import Any, Dict + +import json +import numpy as np +from scipy import sparse as sp + +from . import residue_constants as rc +from .data_ops import NumpyDict + +# from typing import * + + +def lru_cache(maxsize=16, typed=False, copy=False, deepcopy=False): + if deepcopy: + + def decorator(f): + cached_func = functools.lru_cache(maxsize, typed)(f) + + @functools.wraps(f) + def wrapper(*args, **kwargs): + return copy_lib.deepcopy(cached_func(*args, **kwargs)) + + return wrapper + + elif copy: + + def decorator(f): + cached_func = functools.lru_cache(maxsize, typed)(f) + + @functools.wraps(f) + def wrapper(*args, **kwargs): + return copy_lib.copy(cached_func(*args, **kwargs)) + + return wrapper + + else: + decorator = functools.lru_cache(maxsize, typed) + return decorator + + +@lru_cache(maxsize=8, deepcopy=True) +def load_pickle_safe(path: str) -> Dict[str, Any]: + + def load(path): + assert path.endswith('.pkl') or path.endswith( + '.pkl.gz'), f'bad suffix in {path} as pickle file.' + open_fn = gzip.open if path.endswith('.gz') else open + with open_fn(path, 'rb') as f: + return pickle.load(f) + + ret = load(path) + ret = uncompress_features(ret) + return ret + + +@lru_cache(maxsize=8, copy=True) +def load_pickle(path: str) -> Dict[str, Any]: + + def load(path): + assert path.endswith('.pkl') or path.endswith( + '.pkl.gz'), f'bad suffix in {path} as pickle file.' 
+ open_fn = gzip.open if path.endswith('.gz') else open + with open_fn(path, 'rb') as f: + return pickle.load(f) + + ret = load(path) + ret = uncompress_features(ret) + return ret + + +def correct_template_restypes(feature): + """Correct template restype to have the same order as residue_constants.""" + feature = np.argmax(feature, axis=-1).astype(np.int32) + new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE + feature = np.take(new_order_list, feature.astype(np.int32), axis=0) + return feature + + +def convert_all_seq_feature(feature: NumpyDict) -> NumpyDict: + feature['msa'] = feature['msa'].astype(np.uint8) + if 'num_alignments' in feature: + feature.pop('num_alignments') + # make_all_seq_key = lambda k: f'{k}_all_seq' if not k.endswith('_all_seq') else k + + def make_all_seq_key(k): + if not k.endswith('_all_seq'): + return f'{k}_all_seq' + return k + + return {make_all_seq_key(k): v for k, v in feature.items()} + + +def to_dense_matrix(spmat_dict: NumpyDict): + spmat = sp.coo_matrix( + (spmat_dict['data'], (spmat_dict['row'], spmat_dict['col'])), + shape=spmat_dict['shape'], + dtype=np.float32, + ) + return spmat.toarray() + + +FEATS_DTYPE = {'msa': np.int32} + + +def uncompress_features(feats: NumpyDict) -> NumpyDict: + if 'sparse_deletion_matrix_int' in feats: + v = feats.pop('sparse_deletion_matrix_int') + v = to_dense_matrix(v) + feats['deletion_matrix'] = v + return feats + + +def filter(feature: NumpyDict, **kwargs) -> NumpyDict: + assert len(kwargs) == 1, f'wrong usage of filter with kwargs: {kwargs}' + if 'desired_keys' in kwargs: + feature = { + k: v + for k, v in feature.items() if k in kwargs['desired_keys'] + } + elif 'required_keys' in kwargs: + for k in kwargs['required_keys']: + assert k in feature, f'cannot find required key {k}.' 
+ elif 'ignored_keys' in kwargs: + feature = { + k: v + for k, v in feature.items() if k not in kwargs['ignored_keys'] + } + else: + raise AssertionError(f'wrong usage of filter with kwargs: {kwargs}') + return feature + + +def compress_features(features: NumpyDict): + change_dtype = { + 'msa': np.uint8, + } + sparse_keys = ['deletion_matrix_int'] + + compressed_features = {} + for k, v in features.items(): + if k in change_dtype: + v = v.astype(change_dtype[k]) + if k in sparse_keys: + v = sp.coo_matrix(v, dtype=v.dtype) + sp_v = { + 'shape': v.shape, + 'row': v.row, + 'col': v.col, + 'data': v.data + } + k = f'sparse_{k}' + v = sp_v + compressed_features[k] = v + return compressed_features diff --git a/modelscope/models/science/unifold/dataset.py b/modelscope/models/science/unifold/dataset.py new file mode 100644 index 00000000..05803f2c --- /dev/null +++ b/modelscope/models/science/unifold/dataset.py @@ -0,0 +1,514 @@ +import copy +import logging +import os +# from typing import * +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import json +import ml_collections as mlc +import numpy as np +import torch +from unicore.data import UnicoreDataset, data_utils +from unicore.distributed import utils as distributed_utils + +from .data import utils +from .data.data_ops import NumpyDict, TorchDict +from .data.process import process_features, process_labels +from .data.process_multimer import (add_assembly_features, + convert_monomer_features, merge_msas, + pair_and_merge, post_process) + +Rotation = Iterable[Iterable] +Translation = Iterable +Operation = Union[str, Tuple[Rotation, Translation]] +NumpyExample = Tuple[NumpyDict, Optional[List[NumpyDict]]] +TorchExample = Tuple[TorchDict, Optional[List[TorchDict]]] + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + +def make_data_config( + config: mlc.ConfigDict, + mode: str, + num_res: int, +) -> Tuple[mlc.ConfigDict, List[str]]: + cfg = copy.deepcopy(config) + mode_cfg = cfg[mode] + with cfg.unlocked(): + if mode_cfg.crop_size is None: + mode_cfg.crop_size = num_res + feature_names = cfg.common.unsupervised_features + cfg.common.recycling_features + if cfg.common.use_templates: + feature_names += cfg.common.template_features + if cfg.common.is_multimer: + feature_names += cfg.common.multimer_features + if cfg[mode].supervised: + feature_names += cfg.supervised.supervised_features + + return cfg, feature_names + + +def process_label(all_atom_positions: np.ndarray, + operation: Operation) -> np.ndarray: + if operation == 'I': + return all_atom_positions + rot, trans = operation + rot = np.array(rot).reshape(3, 3) + trans = np.array(trans).reshape(3) + return all_atom_positions @ rot.T + trans + + +@utils.lru_cache(maxsize=8, copy=True) +def load_single_feature( + sequence_id: str, + monomer_feature_dir: str, + uniprot_msa_dir: Optional[str] = None, + is_monomer: bool = False, +) -> NumpyDict: + + monomer_feature = utils.load_pickle( + os.path.join(monomer_feature_dir, f'{sequence_id}.feature.pkl.gz')) + monomer_feature = convert_monomer_features(monomer_feature) + chain_feature = {**monomer_feature} + + if uniprot_msa_dir is not None: + all_seq_feature = utils.load_pickle( + os.path.join(uniprot_msa_dir, f'{sequence_id}.uniprot.pkl.gz')) + if is_monomer: + chain_feature['msa'], chain_feature[ + 'deletion_matrix'] = merge_msas( + chain_feature['msa'], + chain_feature['deletion_matrix'], + all_seq_feature['msa'], + all_seq_feature['deletion_matrix'], + ) # noqa + else: + all_seq_feature = 
utils.convert_all_seq_feature(all_seq_feature) + for key in [ + 'msa_all_seq', + 'msa_species_identifiers_all_seq', + 'deletion_matrix_all_seq', + ]: + chain_feature[key] = all_seq_feature[key] + + return chain_feature + + +def load_single_label( + label_id: str, + label_dir: str, + symmetry_operation: Optional[Operation] = None, +) -> NumpyDict: + label = utils.load_pickle( + os.path.join(label_dir, f'{label_id}.label.pkl.gz')) + if symmetry_operation is not None: + label['all_atom_positions'] = process_label( + label['all_atom_positions'], symmetry_operation) + label = { + k: v + for k, v in label.items() if k in + ['aatype', 'all_atom_positions', 'all_atom_mask', 'resolution'] + } + return label + + +def load( + sequence_ids: List[str], + monomer_feature_dir: str, + uniprot_msa_dir: Optional[str] = None, + label_ids: Optional[List[str]] = None, + label_dir: Optional[str] = None, + symmetry_operations: Optional[List[Operation]] = None, + is_monomer: bool = False, +) -> NumpyExample: + + all_chain_features = [ + load_single_feature(s, monomer_feature_dir, uniprot_msa_dir, + is_monomer) for s in sequence_ids + ] + + if label_ids is not None: + # load labels + assert len(label_ids) == len(sequence_ids) + assert label_dir is not None + if symmetry_operations is None: + symmetry_operations = ['I' for _ in label_ids] + all_chain_labels = [ + load_single_label(ll, label_dir, o) + for ll, o in zip(label_ids, symmetry_operations) + ] + # update labels into features to calculate spatial cropping etc. + [f.update(ll) for f, ll in zip(all_chain_features, all_chain_labels)] + + all_chain_features = add_assembly_features(all_chain_features) + + # get labels back from features, as add_assembly_features may alter the order of inputs. + if label_ids is not None: + all_chain_labels = [{ + k: f[k] + for k in + ['aatype', 'all_atom_positions', 'all_atom_mask', 'resolution'] + } for f in all_chain_features] + else: + all_chain_labels = None + + asym_len = np.array([c['seq_length'] for c in all_chain_features], + dtype=np.int64) + if is_monomer: + all_chain_features = all_chain_features[0] + else: + all_chain_features = pair_and_merge(all_chain_features) + all_chain_features = post_process(all_chain_features) + all_chain_features['asym_len'] = asym_len + + return all_chain_features, all_chain_labels + + +def process( + config: mlc.ConfigDict, + mode: str, + features: NumpyDict, + labels: Optional[List[NumpyDict]] = None, + seed: int = 0, + batch_idx: Optional[int] = None, + data_idx: Optional[int] = None, + is_distillation: bool = False, +) -> TorchExample: + + if mode == 'train': + assert batch_idx is not None + with data_utils.numpy_seed(seed, batch_idx, key='recycling'): + num_iters = np.random.randint( + 0, config.common.max_recycling_iters + 1) + use_clamped_fape = np.random.rand( + ) < config[mode].use_clamped_fape_prob + else: + num_iters = config.common.max_recycling_iters + use_clamped_fape = 1 + + features['num_recycling_iters'] = int(num_iters) + features['use_clamped_fape'] = int(use_clamped_fape) + features['is_distillation'] = int(is_distillation) + if is_distillation and 'msa_chains' in features: + features.pop('msa_chains') + + num_res = int(features['seq_length']) + cfg, feature_names = make_data_config(config, mode=mode, num_res=num_res) + + if labels is not None: + features['resolution'] = labels[0]['resolution'].reshape(-1) + + with data_utils.numpy_seed(seed, data_idx, key='protein_feature'): + features['crop_and_fix_size_seed'] = np.random.randint(0, 63355) + features = 
utils.filter(features, desired_keys=feature_names) + features = {k: torch.tensor(v) for k, v in features.items()} + with torch.no_grad(): + features = process_features(features, cfg.common, cfg[mode]) + + if labels is not None: + labels = [{k: torch.tensor(v) for k, v in ll.items()} for ll in labels] + with torch.no_grad(): + labels = process_labels(labels) + + return features, labels + + +def load_and_process( + config: mlc.ConfigDict, + mode: str, + seed: int = 0, + batch_idx: Optional[int] = None, + data_idx: Optional[int] = None, + is_distillation: bool = False, + **load_kwargs, +): + is_monomer = ( + is_distillation + if 'is_monomer' not in load_kwargs else load_kwargs.pop('is_monomer')) + features, labels = load(**load_kwargs, is_monomer=is_monomer) + features, labels = process(config, mode, features, labels, seed, batch_idx, + data_idx, is_distillation) + return features, labels + + +class UnifoldDataset(UnicoreDataset): + + def __init__( + self, + args, + seed, + config, + data_path, + mode='train', + max_step=None, + disable_sd=False, + json_prefix='', + ): + self.path = data_path + + def load_json(filename): + return json.load(open(filename, 'r')) + + sample_weight = load_json( + os.path.join(self.path, + json_prefix + mode + '_sample_weight.json')) + self.multi_label = load_json( + os.path.join(self.path, json_prefix + mode + '_multi_label.json')) + self.inverse_multi_label = self._inverse_map(self.multi_label) + self.sample_weight = {} + for chain in self.inverse_multi_label: + entity = self.inverse_multi_label[chain] + self.sample_weight[chain] = sample_weight[entity] + self.seq_sample_weight = sample_weight + logger.info('load {} chains (unique {} sequences)'.format( + len(self.sample_weight), len(self.seq_sample_weight))) + self.feature_path = os.path.join(self.path, 'pdb_features') + self.label_path = os.path.join(self.path, 'pdb_labels') + sd_sample_weight_path = os.path.join( + self.path, json_prefix + 'sd_train_sample_weight.json') + if mode == 'train' and os.path.isfile( + sd_sample_weight_path) and not disable_sd: + self.sd_sample_weight = load_json(sd_sample_weight_path) + logger.info('load {} self-distillation samples.'.format( + len(self.sd_sample_weight))) + self.sd_feature_path = os.path.join(self.path, 'sd_features') + self.sd_label_path = os.path.join(self.path, 'sd_labels') + else: + self.sd_sample_weight = None + self.batch_size = ( + args.batch_size * distributed_utils.get_data_parallel_world_size() + * args.update_freq[0]) + self.data_len = ( + max_step * self.batch_size + if max_step is not None else len(self.sample_weight)) + self.mode = mode + self.num_seq, self.seq_keys, self.seq_sample_prob = self.cal_sample_weight( + self.seq_sample_weight) + self.num_chain, self.chain_keys, self.sample_prob = self.cal_sample_weight( + self.sample_weight) + if self.sd_sample_weight is not None: + ( + self.sd_num_chain, + self.sd_chain_keys, + self.sd_sample_prob, + ) = self.cal_sample_weight(self.sd_sample_weight) + self.config = config.data + self.seed = seed + self.sd_prob = args.sd_prob + + def cal_sample_weight(self, sample_weight): + prot_keys = list(sample_weight.keys()) + sum_weight = sum(sample_weight.values()) + sample_prob = [sample_weight[k] / sum_weight for k in prot_keys] + num_prot = len(prot_keys) + return num_prot, prot_keys, sample_prob + + def sample_chain(self, idx, sample_by_seq=False): + is_distillation = False + if self.mode == 'train': + with data_utils.numpy_seed(self.seed, idx, key='data_sample'): + is_distillation = ((np.random.rand(1)[0] < 
self.sd_prob) + if self.sd_sample_weight is not None else + False) + if is_distillation: + prot_idx = np.random.choice( + self.sd_num_chain, p=self.sd_sample_prob) + label_name = self.sd_chain_keys[prot_idx] + seq_name = label_name + else: + if not sample_by_seq: + prot_idx = np.random.choice( + self.num_chain, p=self.sample_prob) + label_name = self.chain_keys[prot_idx] + seq_name = self.inverse_multi_label[label_name] + else: + seq_idx = np.random.choice( + self.num_seq, p=self.seq_sample_prob) + seq_name = self.seq_keys[seq_idx] + label_name = np.random.choice( + self.multi_label[seq_name]) + else: + label_name = self.chain_keys[idx] + seq_name = self.inverse_multi_label[label_name] + return seq_name, label_name, is_distillation + + def __getitem__(self, idx): + sequence_id, label_id, is_distillation = self.sample_chain( + idx, sample_by_seq=True) + feature_dir, label_dir = ((self.feature_path, + self.label_path) if not is_distillation else + (self.sd_feature_path, self.sd_label_path)) + features, _ = load_and_process( + self.config, + self.mode, + self.seed, + batch_idx=(idx // self.batch_size), + data_idx=idx, + is_distillation=is_distillation, + sequence_ids=[sequence_id], + monomer_feature_dir=feature_dir, + uniprot_msa_dir=None, + label_ids=[label_id], + label_dir=label_dir, + symmetry_operations=None, + is_monomer=True, + ) + return features + + def __len__(self): + return self.data_len + + @staticmethod + def collater(samples): + # first dim is recyling. bsz is at the 2nd dim + return data_utils.collate_dict(samples, dim=1) + + @staticmethod + def _inverse_map(mapping: Dict[str, List[str]]): + inverse_mapping = {} + for ent, refs in mapping.items(): + for ref in refs: + if ref in inverse_mapping: # duplicated ent for this ref. + ent_2 = inverse_mapping[ref] + assert ( + ent == ent_2 + ), f'multiple entities ({ent_2}, {ent}) exist for reference {ref}.' 
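+                # Map each reference (chain id) back to its entity; e.g. an entry
+                # {'<entity>': ['1abc_A', '1abc_B']} inverts to
+                # {'1abc_A': '<entity>', '1abc_B': '<entity>'} (illustrative ids).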
+ inverse_mapping[ref] = ent + return inverse_mapping + + +class UnifoldMultimerDataset(UnifoldDataset): + + def __init__( + self, + args: mlc.ConfigDict, + seed: int, + config: mlc.ConfigDict, + data_path: str, + mode: str = 'train', + max_step: Optional[int] = None, + disable_sd: bool = False, + json_prefix: str = '', + **kwargs, + ): + super().__init__(args, seed, config, data_path, mode, max_step, + disable_sd, json_prefix) + self.data_path = data_path + self.pdb_assembly = json.load( + open( + os.path.join(self.data_path, + json_prefix + 'pdb_assembly.json'))) + self.pdb_chains = self.get_chains(self.inverse_multi_label) + self.monomer_feature_path = os.path.join(self.data_path, + 'pdb_features') + self.uniprot_msa_path = os.path.join(self.data_path, 'pdb_uniprots') + self.label_path = os.path.join(self.data_path, 'pdb_labels') + self.max_chains = args.max_chains + if self.mode == 'train': + self.pdb_chains, self.sample_weight = self.filter_pdb_by_max_chains( + self.pdb_chains, self.pdb_assembly, self.sample_weight, + self.max_chains) + self.num_chain, self.chain_keys, self.sample_prob = self.cal_sample_weight( + self.sample_weight) + + def __getitem__(self, idx): + seq_id, label_id, is_distillation = self.sample_chain(idx) + if is_distillation: + label_ids = [label_id] + sequence_ids = [seq_id] + monomer_feature_path, uniprot_msa_path, label_path = ( + self.sd_feature_path, + None, + self.sd_label_path, + ) + symmetry_operations = None + else: + pdb_id = self.get_pdb_name(label_id) + if pdb_id in self.pdb_assembly and self.mode == 'train': + label_ids = [ + pdb_id + '_' + id + for id in self.pdb_assembly[pdb_id]['chains'] + ] + symmetry_operations = [ + t for t in self.pdb_assembly[pdb_id]['opers'] + ] + else: + label_ids = self.pdb_chains[pdb_id] + symmetry_operations = None + sequence_ids = [ + self.inverse_multi_label[chain_id] for chain_id in label_ids + ] + monomer_feature_path, uniprot_msa_path, label_path = ( + self.monomer_feature_path, + self.uniprot_msa_path, + self.label_path, + ) + + return load_and_process( + self.config, + self.mode, + self.seed, + batch_idx=(idx // self.batch_size), + data_idx=idx, + is_distillation=is_distillation, + sequence_ids=sequence_ids, + monomer_feature_dir=monomer_feature_path, + uniprot_msa_dir=uniprot_msa_path, + label_ids=label_ids, + label_dir=label_path, + symmetry_operations=symmetry_operations, + is_monomer=False, + ) + + @staticmethod + def collater(samples): + # first dim is recyling. 
bsz is at the 2nd dim + if len(samples) <= 0: # tackle empty batch + return None + feats = [s[0] for s in samples] + labs = [s[1] for s in samples if s[1] is not None] + try: + feats = data_utils.collate_dict(feats, dim=1) + except BaseException: + raise ValueError('cannot collate features', feats) + if not labs: + labs = None + return feats, labs + + @staticmethod + def get_pdb_name(chain): + return chain.split('_')[0] + + @staticmethod + def get_chains(canon_chain_map): + pdb_chains = {} + for chain in canon_chain_map: + pdb = UnifoldMultimerDataset.get_pdb_name(chain) + if pdb not in pdb_chains: + pdb_chains[pdb] = [] + pdb_chains[pdb].append(chain) + return pdb_chains + + @staticmethod + def filter_pdb_by_max_chains(pdb_chains, pdb_assembly, sample_weight, + max_chains): + new_pdb_chains = {} + for chain in pdb_chains: + if chain in pdb_assembly: + size = len(pdb_assembly[chain]['chains']) + if size <= max_chains: + new_pdb_chains[chain] = pdb_chains[chain] + else: + size = len(pdb_chains[chain]) + if size == 1: + new_pdb_chains[chain] = pdb_chains[chain] + new_sample_weight = { + k: sample_weight[k] + for k in sample_weight + if UnifoldMultimerDataset.get_pdb_name(k) in new_pdb_chains + } + logger.info( + f'filtered out {len(pdb_chains) - len(new_pdb_chains)} / {len(pdb_chains)} PDBs ' + f'({len(sample_weight) - len(new_sample_weight)} / {len(sample_weight)} chains) ' + f'by max_chains {max_chains}') + return new_pdb_chains, new_sample_weight diff --git a/modelscope/models/science/unifold/model.py b/modelscope/models/science/unifold/model.py new file mode 100644 index 00000000..6632751a --- /dev/null +++ b/modelscope/models/science/unifold/model.py @@ -0,0 +1,75 @@ +import argparse +import os +from typing import Any + +import torch + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .config import model_config +from .modules.alphafold import AlphaFold + +__all__ = ['UnifoldForProteinStructrue'] + + +@MODELS.register_module(Tasks.protein_structure, module_name=Models.unifold) +class UnifoldForProteinStructrue(TorchModel): + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + '--model-name', + help='choose the model config', + ) + + def __init__(self, **kwargs): + super().__init__() + parser = argparse.ArgumentParser() + parse_comm = [] + for key in kwargs: + parser.add_argument(f'--{key}') + parse_comm.append(f'--{key}') + parse_comm.append(kwargs[key]) + args = parser.parse_args(parse_comm) + base_architecture(args) + self.args = args + config = model_config( + self.args.model_name, + train=True, + ) + self.model = AlphaFold(config) + self.config = config + + # load model state dict + param_path = os.path.join(kwargs['model_dir'], + ModelFile.TORCH_MODEL_BIN_FILE) + state_dict = torch.load(param_path)['ema']['params'] + state_dict = { + '.'.join(k.split('.')[1:]): v + for k, v in state_dict.items() + } + self.model.load_state_dict(state_dict) + + def half(self): + self.model = self.model.half() + return self + + def bfloat16(self): + self.model = self.model.bfloat16() + return self + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + return cls(args) + + def forward(self, batch, **kwargs): + outputs = self.model.forward(batch) + return outputs, self.config.loss + + +def base_architecture(args): + args.model_name = getattr(args, 'model_name', 
'model_2') diff --git a/modelscope/models/science/unifold/modules/alphafold.py b/modelscope/models/science/unifold/modules/alphafold.py new file mode 100644 index 00000000..71a1b310 --- /dev/null +++ b/modelscope/models/science/unifold/modules/alphafold.py @@ -0,0 +1,450 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import torch +import torch.nn as nn +from unicore.utils import tensor_tree_map + +from ..data import residue_constants +from .attentions import gen_msa_attn_mask, gen_tri_attn_mask +from .auxillary_heads import AuxiliaryHeads +from .common import residual +from .embedders import (ExtraMSAEmbedder, InputEmbedder, RecyclingEmbedder, + TemplateAngleEmbedder, TemplatePairEmbedder) +from .evoformer import EvoformerStack, ExtraMSAStack +from .featurization import (atom14_to_atom37, build_extra_msa_feat, + build_template_angle_feat, + build_template_pair_feat, + build_template_pair_feat_v2, pseudo_beta_fn) +from .structure_module import StructureModule +from .template import (TemplatePairStack, TemplatePointwiseAttention, + TemplateProjection) + + +class AlphaFold(nn.Module): + + def __init__(self, config): + super(AlphaFold, self).__init__() + + self.globals = config.globals + config = config.model + template_config = config.template + extra_msa_config = config.extra_msa + + self.input_embedder = InputEmbedder( + **config['input_embedder'], + use_chain_relative=config.is_multimer, + ) + self.recycling_embedder = RecyclingEmbedder( + **config['recycling_embedder'], ) + if config.template.enabled: + self.template_angle_embedder = TemplateAngleEmbedder( + **template_config['template_angle_embedder'], ) + self.template_pair_embedder = TemplatePairEmbedder( + **template_config['template_pair_embedder'], ) + self.template_pair_stack = TemplatePairStack( + **template_config['template_pair_stack'], ) + else: + self.template_pair_stack = None + self.enable_template_pointwise_attention = template_config[ + 'template_pointwise_attention'].enabled + if self.enable_template_pointwise_attention: + self.template_pointwise_att = TemplatePointwiseAttention( + **template_config['template_pointwise_attention'], ) + else: + self.template_proj = TemplateProjection( + **template_config['template_pointwise_attention'], ) + self.extra_msa_embedder = ExtraMSAEmbedder( + **extra_msa_config['extra_msa_embedder'], ) + self.extra_msa_stack = ExtraMSAStack( + **extra_msa_config['extra_msa_stack'], ) + self.evoformer = EvoformerStack(**config['evoformer_stack'], ) + self.structure_module = StructureModule(**config['structure_module'], ) + + self.aux_heads = AuxiliaryHeads(config['heads'], ) + + self.config = config + self.dtype = torch.float + self.inf = self.globals.inf + if self.globals.alphafold_original_mode: + self.alphafold_original_mode() + + def __make_input_float__(self): + self.input_embedder = self.input_embedder.float() + self.recycling_embedder = self.recycling_embedder.float() + + def half(self): + super().half() + if (not getattr(self, 'inference', False)): + self.__make_input_float__() + self.dtype = torch.half + return self + + def bfloat16(self): + super().bfloat16() + if (not getattr(self, 'inference', False)): + self.__make_input_float__() + self.dtype = torch.bfloat16 + return self + + def alphafold_original_mode(self): + + def set_alphafold_original_mode(module): + if hasattr(module, 'apply_alphafold_original_mode'): + module.apply_alphafold_original_mode() + if 
hasattr(module, 'act'): + module.act = nn.ReLU() + + self.apply(set_alphafold_original_mode) + + def inference_mode(self): + + def set_inference_mode(module): + setattr(module, 'inference', True) + + self.apply(set_inference_mode) + + def __convert_input_dtype__(self, batch): + for key in batch: + # only convert features with mask + if batch[key].dtype != self.dtype and 'mask' in key: + batch[key] = batch[key].type(self.dtype) + return batch + + def embed_templates_pair_core(self, batch, z, pair_mask, + tri_start_attn_mask, tri_end_attn_mask, + templ_dim, multichain_mask_2d): + if self.config.template.template_pair_embedder.v2_feature: + t = build_template_pair_feat_v2( + batch, + inf=self.config.template.inf, + eps=self.config.template.eps, + multichain_mask_2d=multichain_mask_2d, + **self.config.template.distogram, + ) + num_template = t[0].shape[-4] + single_templates = [ + self.template_pair_embedder([x[..., ti, :, :, :] + for x in t], z) + for ti in range(num_template) + ] + else: + t = build_template_pair_feat( + batch, + inf=self.config.template.inf, + eps=self.config.template.eps, + **self.config.template.distogram, + ) + single_templates = [ + self.template_pair_embedder(x, z) + for x in torch.unbind(t, dim=templ_dim) + ] + + t = self.template_pair_stack( + single_templates, + pair_mask, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + templ_dim=templ_dim, + chunk_size=self.globals.chunk_size, + block_size=self.globals.block_size, + return_mean=not self.enable_template_pointwise_attention, + ) + return t + + def embed_templates_pair(self, batch, z, pair_mask, tri_start_attn_mask, + tri_end_attn_mask, templ_dim): + if self.config.template.template_pair_embedder.v2_feature and 'asym_id' in batch: + multichain_mask_2d = ( + batch['asym_id'][..., :, None] == batch['asym_id'][..., + None, :]) + multichain_mask_2d = multichain_mask_2d.unsqueeze(0) + else: + multichain_mask_2d = None + + if self.training or self.enable_template_pointwise_attention: + t = self.embed_templates_pair_core(batch, z, pair_mask, + tri_start_attn_mask, + tri_end_attn_mask, templ_dim, + multichain_mask_2d) + if self.enable_template_pointwise_attention: + t = self.template_pointwise_att( + t, + z, + template_mask=batch['template_mask'], + chunk_size=self.globals.chunk_size, + ) + t_mask = torch.sum( + batch['template_mask'], dim=-1, keepdims=True) > 0 + t_mask = t_mask[..., None, None].type(t.dtype) + t *= t_mask + else: + t = self.template_proj(t, z) + else: + template_aatype_shape = batch['template_aatype'].shape + # template_aatype is either [n_template, n_res] or [1, n_template_, n_res] + batch_templ_dim = 1 if len(template_aatype_shape) == 3 else 0 + n_templ = batch['template_aatype'].shape[batch_templ_dim] + + if n_templ <= 0: + t = None + else: + template_batch = { + k: v + for k, v in batch.items() if k.startswith('template_') + } + + def embed_one_template(i): + + def slice_template_tensor(t): + s = [slice(None) for _ in t.shape] + s[batch_templ_dim] = slice(i, i + 1) + return t[s] + + template_feats = tensor_tree_map( + slice_template_tensor, + template_batch, + ) + t = self.embed_templates_pair_core( + template_feats, z, pair_mask, tri_start_attn_mask, + tri_end_attn_mask, templ_dim, multichain_mask_2d) + return t + + t = embed_one_template(0) + # iterate templates one by one + for i in range(1, n_templ): + t += embed_one_template(i) + t /= n_templ + t = self.template_proj(t, z) + return t + + def embed_templates_angle(self, batch): + template_angle_feat, 
template_angle_mask = build_template_angle_feat( + batch, + v2_feature=self.config.template.template_pair_embedder.v2_feature) + t = self.template_angle_embedder(template_angle_feat) + return t, template_angle_mask + + def iteration_evoformer(self, feats, m_1_prev, z_prev, x_prev): + batch_dims = feats['target_feat'].shape[:-2] + n = feats['target_feat'].shape[-2] + seq_mask = feats['seq_mask'] + pair_mask = seq_mask[..., None] * seq_mask[..., None, :] + msa_mask = feats['msa_mask'] + + m, z = self.input_embedder( + feats['target_feat'], + feats['msa_feat'], + ) + + if m_1_prev is None: + m_1_prev = m.new_zeros( + (*batch_dims, n, self.config.input_embedder.d_msa), + requires_grad=False, + ) + if z_prev is None: + z_prev = z.new_zeros( + (*batch_dims, n, n, self.config.input_embedder.d_pair), + requires_grad=False, + ) + if x_prev is None: + x_prev = z.new_zeros( + (*batch_dims, n, residue_constants.atom_type_num, 3), + requires_grad=False, + ) + x_prev = pseudo_beta_fn(feats['aatype'], x_prev, None) + + z += self.recycling_embedder.recyle_pos(x_prev) + + m_1_prev_emb, z_prev_emb = self.recycling_embedder( + m_1_prev, + z_prev, + ) + + m[..., 0, :, :] += m_1_prev_emb + + z += z_prev_emb + + z += self.input_embedder.relpos_emb( + feats['residue_index'].long(), + feats.get('sym_id', None), + feats.get('asym_id', None), + feats.get('entity_id', None), + feats.get('num_sym', None), + ) + + m = m.type(self.dtype) + z = z.type(self.dtype) + tri_start_attn_mask, tri_end_attn_mask = gen_tri_attn_mask( + pair_mask, self.inf) + + if self.config.template.enabled: + template_mask = feats['template_mask'] + if torch.any(template_mask): + z = residual( + z, + self.embed_templates_pair( + feats, + z, + pair_mask, + tri_start_attn_mask, + tri_end_attn_mask, + templ_dim=-4, + ), + self.training, + ) + + if self.config.extra_msa.enabled: + a = self.extra_msa_embedder(build_extra_msa_feat(feats)) + extra_msa_row_mask = gen_msa_attn_mask( + feats['extra_msa_mask'], + inf=self.inf, + gen_col_mask=False, + ) + z = self.extra_msa_stack( + a, + z, + msa_mask=feats['extra_msa_mask'], + chunk_size=self.globals.chunk_size, + block_size=self.globals.block_size, + pair_mask=pair_mask, + msa_row_attn_mask=extra_msa_row_mask, + msa_col_attn_mask=None, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + ) + + if self.config.template.embed_angles: + template_1d_feat, template_1d_mask = self.embed_templates_angle( + feats) + m = torch.cat([m, template_1d_feat], dim=-3) + msa_mask = torch.cat([feats['msa_mask'], template_1d_mask], dim=-2) + + msa_row_mask, msa_col_mask = gen_msa_attn_mask( + msa_mask, + inf=self.inf, + ) + + m, z, s = self.evoformer( + m, + z, + msa_mask=msa_mask, + pair_mask=pair_mask, + msa_row_attn_mask=msa_row_mask, + msa_col_attn_mask=msa_col_mask, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + chunk_size=self.globals.chunk_size, + block_size=self.globals.block_size, + ) + return m, z, s, msa_mask, m_1_prev_emb, z_prev_emb + + def iteration_evoformer_structure_module(self, + batch, + m_1_prev, + z_prev, + x_prev, + cycle_no, + num_recycling, + num_ensembles=1): + z, s = 0, 0 + n_seq = batch['msa_feat'].shape[-3] + assert num_ensembles >= 1 + for ensemble_no in range(num_ensembles): + idx = cycle_no * num_ensembles + ensemble_no + + # fetch_cur_batch = lambda t: t[min(t.shape[0] - 1, idx), ...] + def fetch_cur_batch(t): + return t[min(t.shape[0] - 1, idx), ...] 
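+            # Batch features are stacked along a leading recycling/ensemble axis;
+            # fetch_cur_batch selects the slice for this (cycle, ensemble) index and
+            # clamps it so features stored only once are reused across iterations.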
+ + feats = tensor_tree_map(fetch_cur_batch, batch) + m, z0, s0, msa_mask, m_1_prev_emb, z_prev_emb = self.iteration_evoformer( + feats, m_1_prev, z_prev, x_prev) + z += z0 + s += s0 + del z0, s0 + if num_ensembles > 1: + z /= float(num_ensembles) + s /= float(num_ensembles) + + outputs = {} + + outputs['msa'] = m[..., :n_seq, :, :] + outputs['pair'] = z + outputs['single'] = s + + # norm loss + if (not getattr(self, 'inference', + False)) and num_recycling == (cycle_no + 1): + delta_msa = m + delta_msa[..., + 0, :, :] = delta_msa[..., + 0, :, :] - m_1_prev_emb.detach() + delta_pair = z - z_prev_emb.detach() + outputs['delta_msa'] = delta_msa + outputs['delta_pair'] = delta_pair + outputs['msa_norm_mask'] = msa_mask + + outputs['sm'] = self.structure_module( + s, + z, + feats['aatype'], + mask=feats['seq_mask'], + ) + outputs['final_atom_positions'] = atom14_to_atom37( + outputs['sm']['positions'], feats) + outputs['final_atom_mask'] = feats['atom37_atom_exists'] + outputs['pred_frame_tensor'] = outputs['sm']['frames'][-1] + + # use float32 for numerical stability + if (not getattr(self, 'inference', False)): + m_1_prev = m[..., 0, :, :].float() + z_prev = z.float() + x_prev = outputs['final_atom_positions'].float() + else: + m_1_prev = m[..., 0, :, :] + z_prev = z + x_prev = outputs['final_atom_positions'] + + return outputs, m_1_prev, z_prev, x_prev + + def forward(self, batch): + + m_1_prev = batch.get('m_1_prev', None) + z_prev = batch.get('z_prev', None) + x_prev = batch.get('x_prev', None) + + is_grad_enabled = torch.is_grad_enabled() + + num_iters = int(batch['num_recycling_iters']) + 1 + num_ensembles = int(batch['msa_mask'].shape[0]) // num_iters + if self.training: + # don't use ensemble during training + assert num_ensembles == 1 + + # convert dtypes in batch + batch = self.__convert_input_dtype__(batch) + for cycle_no in range(num_iters): + is_final_iter = cycle_no == (num_iters - 1) + with torch.set_grad_enabled(is_grad_enabled and is_final_iter): + ( + outputs, + m_1_prev, + z_prev, + x_prev, + ) = self.iteration_evoformer_structure_module( + batch, + m_1_prev, + z_prev, + x_prev, + cycle_no=cycle_no, + num_recycling=num_iters, + num_ensembles=num_ensembles, + ) + if not is_final_iter: + del outputs + + if 'asym_id' in batch: + outputs['asym_id'] = batch['asym_id'][0, ...] + outputs.update(self.aux_heads(outputs)) + return outputs diff --git a/modelscope/models/science/unifold/modules/attentions.py b/modelscope/models/science/unifold/modules/attentions.py new file mode 100644 index 00000000..d2319079 --- /dev/null +++ b/modelscope/models/science/unifold/modules/attentions.py @@ -0,0 +1,430 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
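The attention modules in the new attentions.py share one masking convention: padded positions receive a large negative additive bias before the softmax instead of being removed, which is what the gen_attn_mask helper below computes. A minimal standalone sketch of that convention (toy tensors, not the module API):

import torch

mask = torch.tensor([[1., 1., 0.]])        # 1 = valid position, 0 = padding
attn_bias = torch.zeros_like(mask)
attn_bias[mask == 0] = -1e9                # same idea as gen_attn_mask below
scores = torch.randn(1, 3)                 # raw attention logits
probs = torch.softmax(scores + attn_bias, dim=-1)
# the padded position receives (numerically) zero attention weight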
+ +from functools import partialmethod +from typing import List, Optional + +import torch +import torch.nn as nn +from unicore.modules import LayerNorm, softmax_dropout +from unicore.utils import permute_final_dims + +from .common import Linear, chunk_layer + + +def gen_attn_mask(mask, neg_inf): + assert neg_inf < -1e4 + attn_mask = torch.zeros_like(mask) + attn_mask[mask == 0] = neg_inf + return attn_mask + + +class Attention(nn.Module): + + def __init__( + self, + q_dim: int, + k_dim: int, + v_dim: int, + head_dim: int, + num_heads: int, + gating: bool = True, + ): + super(Attention, self).__init__() + + self.num_heads = num_heads + total_dim = head_dim * self.num_heads + self.gating = gating + self.linear_q = Linear(q_dim, total_dim, bias=False, init='glorot') + self.linear_k = Linear(k_dim, total_dim, bias=False, init='glorot') + self.linear_v = Linear(v_dim, total_dim, bias=False, init='glorot') + self.linear_o = Linear(total_dim, q_dim, init='final') + self.linear_g = None + if self.gating: + self.linear_g = Linear(q_dim, total_dim, init='gating') + # precompute the 1/sqrt(head_dim) + self.norm = head_dim**-0.5 + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: torch.Tensor = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + g = None + if self.linear_g is not None: + # gating, use raw query input + g = self.linear_g(q) + + q = self.linear_q(q) + q *= self.norm + k = self.linear_k(k) + v = self.linear_v(v) + + q = q.view(q.shape[:-1] + (self.num_heads, -1)).transpose( + -2, -3).contiguous() + k = k.view(k.shape[:-1] + (self.num_heads, -1)).transpose( + -2, -3).contiguous() + v = v.view(v.shape[:-1] + (self.num_heads, -1)).transpose(-2, -3) + + attn = torch.matmul(q, k.transpose(-1, -2)) + del q, k + + attn = softmax_dropout(attn, 0, self.training, mask=mask, bias=bias) + o = torch.matmul(attn, v) + del attn, v + + o = o.transpose(-2, -3).contiguous() + o = o.view(*o.shape[:-2], -1) + + if g is not None: + o = torch.sigmoid(g) * o + + # merge heads + o = nn.functional.linear(o, self.linear_o.weight) + return o + + def get_output_bias(self): + return self.linear_o.bias + + +class GlobalAttention(nn.Module): + + def __init__(self, input_dim, head_dim, num_heads, inf, eps): + super(GlobalAttention, self).__init__() + + self.num_heads = num_heads + self.inf = inf + self.eps = eps + self.linear_q = Linear( + input_dim, head_dim * num_heads, bias=False, init='glorot') + self.linear_k = Linear(input_dim, head_dim, bias=False, init='glorot') + self.linear_v = Linear(input_dim, head_dim, bias=False, init='glorot') + self.linear_g = Linear(input_dim, head_dim * num_heads, init='gating') + self.linear_o = Linear(head_dim * num_heads, input_dim, init='final') + self.sigmoid = nn.Sigmoid() + # precompute the 1/sqrt(head_dim) + self.norm = head_dim**-0.5 + + def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + + # gating + g = self.sigmoid(self.linear_g(x)) + + k = self.linear_k(x) + v = self.linear_v(x) + + q = torch.sum( + x * mask.unsqueeze(-1), dim=-2) / ( + torch.sum(mask, dim=-1, keepdims=True) + self.eps) + q = self.linear_q(q) + q *= self.norm + q = q.view(q.shape[:-1] + (self.num_heads, -1)) + + attn = torch.matmul(q, k.transpose(-1, -2)) + del q, k + + attn_mask = gen_attn_mask(mask, -self.inf)[..., :, None, :] + attn = softmax_dropout(attn, 0, self.training, mask=attn_mask) + + o = torch.matmul( + attn, + v, + ) + del attn, v + + g = g.view(g.shape[:-1] + (self.num_heads, -1)) + o = o.unsqueeze(-3) * g + del g + 
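# Aside (standalone illustration, not part of this method): GlobalAttention builds a
# single pooled query per group by a mask-weighted mean over the sequence, so its
# cost grows linearly with sequence length rather than quadratically. Toy sketch:
#
#   import torch
#   x = torch.randn(2, 7, 16)                                  # [batch, seq, channels]
#   mask = torch.ones(2, 7)
#   q = (x * mask.unsqueeze(-1)).sum(-2) / (mask.sum(-1, keepdim=True) + 1e-10)
#   # q: [2, 16], one pooled query per example, matching the mean pooling above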
+ # merge heads + o = o.reshape(o.shape[:-2] + (-1, )) + return self.linear_o(o) + + +def gen_msa_attn_mask(mask, inf, gen_col_mask=True): + row_mask = gen_attn_mask(mask, -inf)[..., :, None, None, :] + if gen_col_mask: + col_mask = gen_attn_mask(mask.transpose(-1, -2), -inf)[..., :, None, + None, :] + return row_mask, col_mask + else: + return row_mask + + +class MSAAttention(nn.Module): + + def __init__( + self, + d_in, + d_hid, + num_heads, + pair_bias=False, + d_pair=None, + ): + super(MSAAttention, self).__init__() + + self.pair_bias = pair_bias + self.layer_norm_m = LayerNorm(d_in) + self.layer_norm_z = None + self.linear_z = None + if self.pair_bias: + self.layer_norm_z = LayerNorm(d_pair) + self.linear_z = Linear( + d_pair, num_heads, bias=False, init='normal') + + self.mha = Attention(d_in, d_in, d_in, d_hid, num_heads) + + @torch.jit.ignore + def _chunk( + self, + m: torch.Tensor, + mask: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + chunk_size: int = None, + ) -> torch.Tensor: + + return chunk_layer( + self._attn_forward, + { + 'm': m, + 'mask': mask, + 'bias': bias + }, + chunk_size=chunk_size, + num_batch_dims=len(m.shape[:-2]), + ) + + @torch.jit.ignore + def _attn_chunk_forward( + self, + m: torch.Tensor, + mask: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = 2560, + ) -> torch.Tensor: + m = self.layer_norm_m(m) + num_chunk = (m.shape[-3] + chunk_size - 1) // chunk_size + outputs = [] + for i in range(num_chunk): + chunk_start = i * chunk_size + chunk_end = min(m.shape[-3], chunk_start + chunk_size) + cur_m = m[..., chunk_start:chunk_end, :, :] + cur_mask = ( + mask[..., chunk_start:chunk_end, :, :, :] + if mask is not None else None) + outputs.append( + self.mha(q=cur_m, k=cur_m, v=cur_m, mask=cur_mask, bias=bias)) + return torch.concat(outputs, dim=-3) + + def _attn_forward(self, m, mask, bias: Optional[torch.Tensor] = None): + m = self.layer_norm_m(m) + return self.mha(q=m, k=m, v=m, mask=mask, bias=bias) + + def forward( + self, + m: torch.Tensor, + z: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + + bias = None + if self.pair_bias: + z = self.layer_norm_z(z) + bias = ( + permute_final_dims(self.linear_z(z), + (2, 0, 1)).unsqueeze(-4).contiguous()) + + if chunk_size is not None: + m = self._chunk(m, attn_mask, bias, chunk_size) + else: + attn_chunk_size = 2560 + if m.shape[-3] <= attn_chunk_size: + m = self._attn_forward(m, attn_mask, bias) + else: + # reduce the peak memory cost in extra_msa_stack + return self._attn_chunk_forward( + m, attn_mask, bias, chunk_size=attn_chunk_size) + + return m + + def get_output_bias(self): + return self.mha.get_output_bias() + + +class MSARowAttentionWithPairBias(MSAAttention): + + def __init__(self, d_msa, d_pair, d_hid, num_heads): + super(MSARowAttentionWithPairBias, self).__init__( + d_msa, + d_hid, + num_heads, + pair_bias=True, + d_pair=d_pair, + ) + + +class MSAColumnAttention(MSAAttention): + + def __init__(self, d_msa, d_hid, num_heads): + super(MSAColumnAttention, self).__init__( + d_in=d_msa, + d_hid=d_hid, + num_heads=num_heads, + pair_bias=False, + d_pair=None, + ) + + def forward( + self, + m: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + m = m.transpose(-2, -3) + m = super().forward(m, attn_mask=attn_mask, chunk_size=chunk_size) + m = m.transpose(-2, -3) + + return m + + +class 
MSAColumnGlobalAttention(nn.Module): + + def __init__( + self, + d_in, + d_hid, + num_heads, + inf=1e9, + eps=1e-10, + ): + super(MSAColumnGlobalAttention, self).__init__() + + self.layer_norm_m = LayerNorm(d_in) + self.global_attention = GlobalAttention( + d_in, + d_hid, + num_heads, + inf=inf, + eps=eps, + ) + + @torch.jit.ignore + def _chunk( + self, + m: torch.Tensor, + mask: torch.Tensor, + chunk_size: int, + ) -> torch.Tensor: + return chunk_layer( + self._attn_forward, + { + 'm': m, + 'mask': mask + }, + chunk_size=chunk_size, + num_batch_dims=len(m.shape[:-2]), + ) + + def _attn_forward(self, m, mask): + m = self.layer_norm_m(m) + return self.global_attention(m, mask=mask) + + def forward( + self, + m: torch.Tensor, + mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + + m = m.transpose(-2, -3) + mask = mask.transpose(-1, -2) + + if chunk_size is not None: + m = self._chunk(m, mask, chunk_size) + else: + m = self._attn_forward(m, mask=mask) + + m = m.transpose(-2, -3) + return m + + +def gen_tri_attn_mask(mask, inf): + start_mask = gen_attn_mask(mask, -inf)[..., :, None, None, :] + end_mask = gen_attn_mask(mask.transpose(-1, -2), -inf)[..., :, None, + None, :] + return start_mask, end_mask + + +class TriangleAttention(nn.Module): + + def __init__( + self, + d_in, + d_hid, + num_heads, + starting, + ): + super(TriangleAttention, self).__init__() + self.starting = starting + self.layer_norm = LayerNorm(d_in) + self.linear = Linear(d_in, num_heads, bias=False, init='normal') + self.mha = Attention(d_in, d_in, d_in, d_hid, num_heads) + + @torch.jit.ignore + def _chunk( + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + chunk_size: int = None, + ) -> torch.Tensor: + return chunk_layer( + self.mha, + { + 'q': x, + 'k': x, + 'v': x, + 'mask': mask, + 'bias': bias + }, + chunk_size=chunk_size, + num_batch_dims=len(x.shape[:-2]), + ) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + if not self.starting: + x = x.transpose(-2, -3) + + x = self.layer_norm(x) + triangle_bias = ( + permute_final_dims(self.linear(x), + (2, 0, 1)).unsqueeze(-4).contiguous()) + + if chunk_size is not None: + x = self._chunk(x, attn_mask, triangle_bias, chunk_size) + else: + x = self.mha(q=x, k=x, v=x, mask=attn_mask, bias=triangle_bias) + + if not self.starting: + x = x.transpose(-2, -3) + return x + + def get_output_bias(self): + return self.mha.get_output_bias() + + +class TriangleAttentionStarting(TriangleAttention): + __init__ = partialmethod(TriangleAttention.__init__, starting=True) + + +class TriangleAttentionEnding(TriangleAttention): + __init__ = partialmethod(TriangleAttention.__init__, starting=False) diff --git a/modelscope/models/science/unifold/modules/auxillary_heads.py b/modelscope/models/science/unifold/modules/auxillary_heads.py new file mode 100644 index 00000000..2daf5d55 --- /dev/null +++ b/modelscope/models/science/unifold/modules/auxillary_heads.py @@ -0,0 +1,171 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
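The plddt head defined below emits num_bins logits per residue; predicted_lddt in confidence.py (added later in this patch) turns them into a scalar confidence by taking the probability-weighted average of the bin centers. A standalone sketch with toy shapes (values are illustrative only):

import torch

num_res, num_bins = 10, 50
plddt_logits = torch.randn(num_res, num_bins)                 # stand-in for head output
bin_width = 1.0 / num_bins
bin_centers = torch.arange(0.5 * bin_width, 1.0, bin_width)   # [num_bins]
probs = torch.softmax(plddt_logits, dim=-1)
plddt = (probs * bin_centers).sum(-1)                         # [num_res], in (0, 1); usually reported x100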
+ +from typing import Dict + +import torch.nn as nn +from unicore.modules import LayerNorm + +from .common import Linear +from .confidence import (predicted_aligned_error, predicted_lddt, + predicted_tm_score) + + +class AuxiliaryHeads(nn.Module): + + def __init__(self, config): + super(AuxiliaryHeads, self).__init__() + + self.plddt = PredictedLDDTHead(**config['plddt'], ) + + self.distogram = DistogramHead(**config['distogram'], ) + + self.masked_msa = MaskedMSAHead(**config['masked_msa'], ) + + if config.experimentally_resolved.enabled: + self.experimentally_resolved = ExperimentallyResolvedHead( + **config['experimentally_resolved'], ) + + if config.pae.enabled: + self.pae = PredictedAlignedErrorHead(**config.pae, ) + + self.config = config + + def forward(self, outputs): + aux_out = {} + plddt_logits = self.plddt(outputs['sm']['single']) + aux_out['plddt_logits'] = plddt_logits + + aux_out['plddt'] = predicted_lddt(plddt_logits.detach()) + + distogram_logits = self.distogram(outputs['pair']) + aux_out['distogram_logits'] = distogram_logits + + masked_msa_logits = self.masked_msa(outputs['msa']) + aux_out['masked_msa_logits'] = masked_msa_logits + + if self.config.experimentally_resolved.enabled: + exp_res_logits = self.experimentally_resolved(outputs['single']) + aux_out['experimentally_resolved_logits'] = exp_res_logits + + if self.config.pae.enabled: + pae_logits = self.pae(outputs['pair']) + aux_out['pae_logits'] = pae_logits + pae_logits = pae_logits.detach() + aux_out.update( + predicted_aligned_error( + pae_logits, + **self.config.pae, + )) + aux_out['ptm'] = predicted_tm_score( + pae_logits, interface=False, **self.config.pae) + + iptm_weight = self.config.pae.get('iptm_weight', 0.0) + if iptm_weight > 0.0: + aux_out['iptm'] = predicted_tm_score( + pae_logits, + interface=True, + asym_id=outputs['asym_id'], + **self.config.pae, + ) + aux_out['iptm+ptm'] = ( + iptm_weight * aux_out['iptm'] + # noqa W504 + (1.0 - iptm_weight) * aux_out['ptm']) + + return aux_out + + +class PredictedLDDTHead(nn.Module): + + def __init__(self, num_bins, d_in, d_hid): + super(PredictedLDDTHead, self).__init__() + + self.num_bins = num_bins + self.d_in = d_in + self.d_hid = d_hid + + self.layer_norm = LayerNorm(self.d_in) + + self.linear_1 = Linear(self.d_in, self.d_hid, init='relu') + self.linear_2 = Linear(self.d_hid, self.d_hid, init='relu') + self.act = nn.GELU() + self.linear_3 = Linear(self.d_hid, self.num_bins, init='final') + + def forward(self, s): + s = self.layer_norm(s) + s = self.linear_1(s) + s = self.act(s) + s = self.linear_2(s) + s = self.act(s) + s = self.linear_3(s) + return s + + +class EnhancedHeadBase(nn.Module): + + def __init__(self, d_in, d_out, disable_enhance_head): + super(EnhancedHeadBase, self).__init__() + if disable_enhance_head: + self.layer_norm = None + self.linear_in = None + else: + self.layer_norm = LayerNorm(d_in) + self.linear_in = Linear(d_in, d_in, init='relu') + self.act = nn.GELU() + self.linear = Linear(d_in, d_out, init='final') + + def apply_alphafold_original_mode(self): + self.layer_norm = None + self.linear_in = None + + def forward(self, x): + if self.layer_norm is not None: + x = self.layer_norm(x) + x = self.act(self.linear_in(x)) + logits = self.linear(x) + return logits + + +class DistogramHead(EnhancedHeadBase): + + def __init__(self, d_pair, num_bins, disable_enhance_head, **kwargs): + super(DistogramHead, self).__init__( + d_in=d_pair, + d_out=num_bins, + disable_enhance_head=disable_enhance_head, + ) + + def forward(self, x): + logits = 
super().forward(x) + logits = logits + logits.transpose(-2, -3) + return logits + + +class PredictedAlignedErrorHead(EnhancedHeadBase): + + def __init__(self, d_pair, num_bins, disable_enhance_head, **kwargs): + super(PredictedAlignedErrorHead, self).__init__( + d_in=d_pair, + d_out=num_bins, + disable_enhance_head=disable_enhance_head, + ) + + +class MaskedMSAHead(EnhancedHeadBase): + + def __init__(self, d_msa, d_out, disable_enhance_head, **kwargs): + super(MaskedMSAHead, self).__init__( + d_in=d_msa, + d_out=d_out, + disable_enhance_head=disable_enhance_head, + ) + + +class ExperimentallyResolvedHead(EnhancedHeadBase): + + def __init__(self, d_single, d_out, disable_enhance_head, **kwargs): + super(ExperimentallyResolvedHead, self).__init__( + d_in=d_single, + d_out=d_out, + disable_enhance_head=disable_enhance_head, + ) diff --git a/modelscope/models/science/unifold/modules/common.py b/modelscope/models/science/unifold/modules/common.py new file mode 100644 index 00000000..186f2567 --- /dev/null +++ b/modelscope/models/science/unifold/modules/common.py @@ -0,0 +1,387 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +from functools import partial +from typing import Any, Callable, Dict, Iterable, List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from unicore.modules import LayerNorm +from unicore.utils import tensor_tree_map + + +class Linear(nn.Linear): + + def __init__( + self, + d_in: int, + d_out: int, + bias: bool = True, + init: str = 'default', + ): + super(Linear, self).__init__(d_in, d_out, bias=bias) + + self.use_bias = bias + + if self.use_bias: + with torch.no_grad(): + self.bias.fill_(0) + + if init == 'default': + self._trunc_normal_init(1.0) + elif init == 'relu': + self._trunc_normal_init(2.0) + elif init == 'glorot': + self._glorot_uniform_init() + elif init == 'gating': + self._zero_init(self.use_bias) + elif init == 'normal': + self._normal_init() + elif init == 'final': + self._zero_init(False) + else: + raise ValueError('Invalid init method.') + + def _trunc_normal_init(self, scale=1.0): + # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) 
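# Standalone illustration of the initializer below (not part of this class): weights
# are drawn from a truncated normal whose std is scaled by 1/fan_in and corrected by
# the truncation constant above, e.g.:
#
#   import torch
#   import torch.nn as nn
#   w = torch.empty(64, 128)                     # [d_out, fan_in], toy shape
#   fan_in = w.shape[1]
#   std = (1.0 / max(1, fan_in)) ** 0.5 / 0.87962566103423978
#   nn.init.trunc_normal_(w, mean=0.0, std=std)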
+ TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978 + _, fan_in = self.weight.shape + scale = scale / max(1, fan_in) + std = (scale**0.5) / TRUNCATED_NORMAL_STDDEV_FACTOR + nn.init.trunc_normal_(self.weight, mean=0.0, std=std) + + def _glorot_uniform_init(self): + nn.init.xavier_uniform_(self.weight, gain=1) + + def _zero_init(self, use_bias=True): + with torch.no_grad(): + self.weight.fill_(0.0) + if use_bias: + with torch.no_grad(): + self.bias.fill_(1.0) + + def _normal_init(self): + torch.nn.init.kaiming_normal_(self.weight, nonlinearity='linear') + + +class Transition(nn.Module): + + def __init__(self, d_in, n): + + super(Transition, self).__init__() + + self.d_in = d_in + self.n = n + + self.layer_norm = LayerNorm(self.d_in) + self.linear_1 = Linear(self.d_in, self.n * self.d_in, init='relu') + self.act = nn.GELU() + self.linear_2 = Linear(self.n * self.d_in, d_in, init='final') + + def _transition(self, x): + x = self.layer_norm(x) + x = self.linear_1(x) + x = self.act(x) + x = self.linear_2(x) + return x + + @torch.jit.ignore + def _chunk( + self, + x: torch.Tensor, + chunk_size: int, + ) -> torch.Tensor: + return chunk_layer( + self._transition, + {'x': x}, + chunk_size=chunk_size, + num_batch_dims=len(x.shape[:-2]), + ) + + def forward( + self, + x: torch.Tensor, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + + if chunk_size is not None: + x = self._chunk(x, chunk_size) + else: + x = self._transition(x=x) + + return x + + +class OuterProductMean(nn.Module): + + def __init__(self, d_msa, d_pair, d_hid, eps=1e-3): + super(OuterProductMean, self).__init__() + + self.d_msa = d_msa + self.d_pair = d_pair + self.d_hid = d_hid + self.eps = eps + + self.layer_norm = LayerNorm(d_msa) + self.linear_1 = Linear(d_msa, d_hid) + self.linear_2 = Linear(d_msa, d_hid) + self.linear_out = Linear(d_hid**2, d_pair, init='relu') + self.act = nn.GELU() + self.linear_z = Linear(self.d_pair, self.d_pair, init='final') + self.layer_norm_out = LayerNorm(self.d_pair) + + def _opm(self, a, b): + outer = torch.einsum('...bac,...dae->...bdce', a, b) + outer = outer.reshape(outer.shape[:-2] + (-1, )) + outer = self.linear_out(outer) + return outer + + @torch.jit.ignore + def _chunk(self, a: torch.Tensor, b: torch.Tensor, + chunk_size: int) -> torch.Tensor: + a = a.reshape((-1, ) + a.shape[-3:]) + b = b.reshape((-1, ) + b.shape[-3:]) + out = [] + # TODO: optimize this + for a_prime, b_prime in zip(a, b): + outer = chunk_layer( + partial(self._opm, b=b_prime), + {'a': a_prime}, + chunk_size=chunk_size, + num_batch_dims=1, + ) + out.append(outer) + if len(out) == 1: + outer = out[0].unsqueeze(0) + else: + outer = torch.stack(out, dim=0) + outer = outer.reshape(a.shape[:-3] + outer.shape[1:]) + + return outer + + def apply_alphafold_original_mode(self): + self.linear_z = None + self.layer_norm_out = None + + def forward( + self, + m: torch.Tensor, + mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + + m = self.layer_norm(m) + mask = mask.unsqueeze(-1) + if self.layer_norm_out is not None: + # for numerical stability + mask = mask * (mask.size(-2)**-0.5) + a = self.linear_1(m) + b = self.linear_2(m) + if self.training: + a = a * mask + b = b * mask + else: + a *= mask + b *= mask + + a = a.transpose(-2, -3) + b = b.transpose(-2, -3) + + if chunk_size is not None: + z = self._chunk(a, b, chunk_size) + else: + z = self._opm(a, b) + + norm = torch.einsum('...abc,...adc->...bdc', mask, mask) + z /= self.eps + norm + if self.layer_norm_out is not None: + z = 
self.act(z) + z = self.layer_norm_out(z) + z = self.linear_z(z) + return z + + +def residual(residual, x, training): + if training: + return x + residual + else: + residual += x + return residual + + +@torch.jit.script +def fused_bias_dropout_add( + x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, + dropmask: torch.Tensor, + prob: float, +) -> torch.Tensor: + return (x + bias) * F.dropout(dropmask, p=prob, training=True) + residual + + +@torch.jit.script +def fused_bias_dropout_add_inference( + x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, +) -> torch.Tensor: + residual += bias + x + return residual + + +def bias_dropout_residual(module, residual, x, dropout_shared_dim, prob, + training): + bias = module.get_output_bias() + if training: + shape = list(x.shape) + shape[dropout_shared_dim] = 1 + with torch.no_grad(): + mask = x.new_ones(shape) + return fused_bias_dropout_add(x, bias, residual, mask, prob) + else: + return fused_bias_dropout_add_inference(x, bias, residual) + + +@torch.jit.script +def fused_bias_gated_dropout_add( + x: torch.Tensor, + bias: torch.Tensor, + g: torch.Tensor, + g_bias: torch.Tensor, + residual: torch.Tensor, + dropout_mask: torch.Tensor, + prob: float, +) -> torch.Tensor: + return (torch.sigmoid(g + g_bias) * (x + bias)) * F.dropout( + dropout_mask, + p=prob, + training=True, + ) + residual + + +def tri_mul_residual( + module, + residual, + outputs, + dropout_shared_dim, + prob, + training, + block_size, +): + if training: + x, g = outputs + bias, g_bias = module.get_output_bias() + shape = list(x.shape) + shape[dropout_shared_dim] = 1 + with torch.no_grad(): + mask = x.new_ones(shape) + return fused_bias_gated_dropout_add( + x, + bias, + g, + g_bias, + residual, + mask, + prob, + ) + elif block_size is None: + x, g = outputs + bias, g_bias = module.get_output_bias() + residual += (torch.sigmoid(g + g_bias) * (x + bias)) + return residual + else: + # gated is not used here + residual += outputs + return residual + + +class SimpleModuleList(nn.ModuleList): + + def __repr__(self): + return str(len(self)) + ' X ...\n' + self[0].__repr__() + + +def chunk_layer( + layer: Callable, + inputs: Dict[str, Any], + chunk_size: int, + num_batch_dims: int, +) -> Any: + # TODO: support inplace add to output + if not (len(inputs) > 0): + raise ValueError('Must provide at least one input') + + def _dict_get_shapes(input): + shapes = [] + if type(input) is torch.Tensor: + shapes.append(input.shape) + elif type(input) is dict: + for v in input.values(): + shapes.extend(_dict_get_shapes(v)) + elif isinstance(input, Iterable): + for v in input: + shapes.extend(_dict_get_shapes(v)) + else: + raise ValueError('Not supported') + + return shapes + + inputs = {k: v for k, v in inputs.items() if v is not None} + initial_dims = [ + shape[:num_batch_dims] for shape in _dict_get_shapes(inputs) + ] + orig_batch_dims = tuple([max(s) for s in zip(*initial_dims)]) + + flat_batch_dim = 1 + for d in orig_batch_dims: + flat_batch_dim *= d + num_chunks = (flat_batch_dim + chunk_size - 1) // chunk_size + + def _flat_inputs(t): + t = t.view(-1, *t.shape[num_batch_dims:]) + assert ( + t.shape[0] == flat_batch_dim or t.shape[0] == 1 + ), 'batch dimension must be 1 or equal to the flat batch dimension' + return t + + flat_inputs = tensor_tree_map(_flat_inputs, inputs) + + out = None + for i in range(num_chunks): + chunk_start = i * chunk_size + chunk_end = min((i + 1) * chunk_size, flat_batch_dim) + + def select_chunk(t): + if t.shape[0] == 1: + return t[0:1] + else: + 
return t[chunk_start:chunk_end] + + chunkes = tensor_tree_map(select_chunk, flat_inputs) + + output_chunk = layer(**chunkes) + + if out is None: + out = tensor_tree_map( + lambda t: t.new_zeros((flat_batch_dim, ) + t.shape[1:]), + output_chunk) + + out_type = type(output_chunk) + if out_type is tuple: + for x, y in zip(out, output_chunk): + x[chunk_start:chunk_end] = y + elif out_type is torch.Tensor: + out[chunk_start:chunk_end] = output_chunk + else: + raise ValueError('Not supported') + + # reshape = lambda t: t.view(orig_batch_dims + t.shape[1:]) + def reshape(t): + return t.view(orig_batch_dims + t.shape[1:]) + + out = tensor_tree_map(reshape, out) + + return out diff --git a/modelscope/models/science/unifold/modules/confidence.py b/modelscope/models/science/unifold/modules/confidence.py new file mode 100644 index 00000000..7574689c --- /dev/null +++ b/modelscope/models/science/unifold/modules/confidence.py @@ -0,0 +1,159 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +from typing import Dict, Optional, Tuple + +import torch + + +def predicted_lddt(plddt_logits: torch.Tensor) -> torch.Tensor: + """Computes per-residue pLDDT from logits. + Args: + logits: [num_res, num_bins] output from the PredictedLDDTHead. + Returns: + plddt: [num_res] per-residue pLDDT. + """ + num_bins = plddt_logits.shape[-1] + bin_probs = torch.nn.functional.softmax(plddt_logits.float(), dim=-1) + bin_width = 1.0 / num_bins + bounds = torch.arange( + start=0.5 * bin_width, + end=1.0, + step=bin_width, + device=plddt_logits.device) + plddt = torch.sum( + bin_probs + * bounds.view(*((1, ) * len(bin_probs.shape[:-1])), *bounds.shape), + dim=-1, + ) + return plddt + + +def compute_bin_values(breaks: torch.Tensor): + """Gets the bin centers from the bin edges. + Args: + breaks: [num_bins - 1] the error bin edges. + Returns: + bin_centers: [num_bins] the error bin centers. + """ + step = breaks[1] - breaks[0] + bin_values = breaks + step / 2 + bin_values = torch.cat([bin_values, (bin_values[-1] + step).unsqueeze(-1)], + dim=0) + return bin_values + + +def compute_predicted_aligned_error( + bin_edges: torch.Tensor, + bin_probs: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Calculates expected aligned distance errors for every pair of residues. + Args: + alignment_confidence_breaks: [num_bins - 1] the error bin edges. + aligned_distance_error_probs: [num_res, num_res, num_bins] the predicted + probs for each error bin, for each pair of residues. + Returns: + predicted_aligned_error: [num_res, num_res] the expected aligned distance + error for each pair of residues. + max_predicted_aligned_error: The maximum predicted error possible. + """ + bin_values = compute_bin_values(bin_edges) + return torch.sum(bin_probs * bin_values, dim=-1) + + +def predicted_aligned_error( + pae_logits: torch.Tensor, + max_bin: int = 31, + num_bins: int = 64, + **kwargs, +) -> Dict[str, torch.Tensor]: + """Computes aligned confidence metrics from logits. + Args: + logits: [num_res, num_res, num_bins] the logits output from + PredictedAlignedErrorHead. + breaks: [num_bins - 1] the error bin edges. + Returns: + aligned_confidence_probs: [num_res, num_res, num_bins] the predicted + aligned error probabilities over bins for each residue pair. + predicted_aligned_error: [num_res, num_res] the expected aligned distance + error for each pair of residues. 
+ max_predicted_aligned_error: The maximum predicted error possible. + """ + bin_probs = torch.nn.functional.softmax(pae_logits.float(), dim=-1) + bin_edges = torch.linspace( + 0, max_bin, steps=(num_bins - 1), device=pae_logits.device) + + predicted_aligned_error = compute_predicted_aligned_error( + bin_edges=bin_edges, + bin_probs=bin_probs, + ) + + return { + 'aligned_error_probs_per_bin': bin_probs, + 'predicted_aligned_error': predicted_aligned_error, + } + + +def predicted_tm_score( + pae_logits: torch.Tensor, + residue_weights: Optional[torch.Tensor] = None, + max_bin: int = 31, + num_bins: int = 64, + eps: float = 1e-8, + asym_id: Optional[torch.Tensor] = None, + interface: bool = False, + **kwargs, +) -> torch.Tensor: + """Computes predicted TM alignment or predicted interface TM alignment score. + Args: + logits: [num_res, num_res, num_bins] the logits output from + PredictedAlignedErrorHead. + breaks: [num_bins] the error bins. + residue_weights: [num_res] the per residue weights to use for the + expectation. + asym_id: [num_res] the asymmetric unit ID - the chain ID. Only needed for + ipTM calculation, i.e. when interface=True. + interface: If True, interface predicted TM score is computed. + Returns: + ptm_score: The predicted TM alignment or the predicted iTM score. + """ + pae_logits = pae_logits.float() + if residue_weights is None: + residue_weights = pae_logits.new_ones(pae_logits.shape[:-2]) + + breaks = torch.linspace( + 0, max_bin, steps=(num_bins - 1), device=pae_logits.device) + + def tm_kernal(nres): + clipped_n = max(nres, 19) + d0 = 1.24 * (clipped_n - 15)**(1.0 / 3.0) - 1.8 + return lambda x: 1.0 / (1.0 + (x / d0)**2) + + def rmsd_kernal(eps): # leave for compute pRMS + return lambda x: 1. / (x + eps) + + bin_centers = compute_bin_values(breaks) + probs = torch.nn.functional.softmax(pae_logits, dim=-1) + tm_per_bin = tm_kernal(nres=pae_logits.shape[-2])(bin_centers) + # tm_per_bin = 1.0 / (1 + (bin_centers ** 2) / (d0 ** 2)) + # rmsd_per_bin = rmsd_kernal()(bin_centers) + predicted_tm_term = torch.sum(probs * tm_per_bin, dim=-1) + + pair_mask = predicted_tm_term.new_ones(predicted_tm_term.shape) + if interface: + assert asym_id is not None, 'must provide asym_id for iptm calculation.' + pair_mask *= asym_id[..., :, None] != asym_id[..., None, :] + + predicted_tm_term *= pair_mask + + pair_residue_weights = pair_mask * ( + residue_weights[None, :] * residue_weights[:, None]) + normed_residue_mask = pair_residue_weights / ( + eps + pair_residue_weights.sum(dim=-1, keepdim=True)) + + per_alignment = torch.sum(predicted_tm_term * normed_residue_mask, dim=-1) + weighted = per_alignment * residue_weights + ret = per_alignment.gather( + dim=-1, index=weighted.max(dim=-1, + keepdim=True).indices).squeeze(dim=-1) + return ret diff --git a/modelscope/models/science/unifold/modules/embedders.py b/modelscope/models/science/unifold/modules/embedders.py new file mode 100644 index 00000000..84e87e2d --- /dev/null +++ b/modelscope/models/science/unifold/modules/embedders.py @@ -0,0 +1,290 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
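predicted_tm_score above is an expectation of the TM-score kernel 1 / (1 + (d / d0)^2) over the predicted aligned-error bins, followed by a weighted average over aligned residues and a maximum over anchor residues. A compact standalone sketch with toy logits and uniform residue weights (shapes and values are made up):

import torch

num_res, num_bins, max_bin = 50, 64, 31
pae_logits = torch.randn(num_res, num_res, num_bins)
breaks = torch.linspace(0, max_bin, steps=num_bins - 1)
step = breaks[1] - breaks[0]
bin_centers = torch.cat([breaks + step / 2, (breaks[-1] + 1.5 * step).unsqueeze(0)])
d0 = 1.24 * (max(num_res, 19) - 15) ** (1.0 / 3.0) - 1.8
tm_per_bin = 1.0 / (1.0 + (bin_centers / d0) ** 2)
probs = torch.softmax(pae_logits, dim=-1)
predicted_tm = (probs * tm_per_bin).sum(-1)    # [num_res, num_res]
per_alignment = predicted_tm.mean(-1)          # uniform residue weights
ptm = per_alignment.max()                      # best anchor residue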
+ +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from unicore.modules import LayerNorm +from unicore.utils import one_hot + +from .common import Linear, SimpleModuleList, residual + + +class InputEmbedder(nn.Module): + + def __init__( + self, + tf_dim: int, + msa_dim: int, + d_pair: int, + d_msa: int, + relpos_k: int, + use_chain_relative: bool = False, + max_relative_chain: Optional[int] = None, + **kwargs, + ): + super(InputEmbedder, self).__init__() + + self.tf_dim = tf_dim + self.msa_dim = msa_dim + + self.d_pair = d_pair + self.d_msa = d_msa + + self.linear_tf_z_i = Linear(tf_dim, d_pair) + self.linear_tf_z_j = Linear(tf_dim, d_pair) + self.linear_tf_m = Linear(tf_dim, d_msa) + self.linear_msa_m = Linear(msa_dim, d_msa) + + # RPE stuff + self.relpos_k = relpos_k + self.use_chain_relative = use_chain_relative + self.max_relative_chain = max_relative_chain + if not self.use_chain_relative: + self.num_bins = 2 * self.relpos_k + 1 + else: + self.num_bins = 2 * self.relpos_k + 2 + self.num_bins += 1 # entity id + self.num_bins += 2 * max_relative_chain + 2 + + self.linear_relpos = Linear(self.num_bins, d_pair) + + def _relpos_indices( + self, + res_id: torch.Tensor, + sym_id: Optional[torch.Tensor] = None, + asym_id: Optional[torch.Tensor] = None, + entity_id: Optional[torch.Tensor] = None, + ): + + max_rel_res = self.relpos_k + rp = res_id[..., None] - res_id[..., None, :] + rp = rp.clip(-max_rel_res, max_rel_res) + max_rel_res + if not self.use_chain_relative: + return rp + else: + asym_id_same = asym_id[..., :, None] == asym_id[..., None, :] + rp[~asym_id_same] = 2 * max_rel_res + 1 + entity_id_same = entity_id[..., :, None] == entity_id[..., None, :] + rp_entity_id = entity_id_same.type(rp.dtype)[..., None] + + rel_sym_id = sym_id[..., :, None] - sym_id[..., None, :] + + max_rel_chain = self.max_relative_chain + + clipped_rel_chain = torch.clamp( + rel_sym_id + max_rel_chain, min=0, max=2 * max_rel_chain) + + clipped_rel_chain[~entity_id_same] = 2 * max_rel_chain + 1 + return rp, rp_entity_id, clipped_rel_chain + + def relpos_emb( + self, + res_id: torch.Tensor, + sym_id: Optional[torch.Tensor] = None, + asym_id: Optional[torch.Tensor] = None, + entity_id: Optional[torch.Tensor] = None, + num_sym: Optional[torch.Tensor] = None, + ): + + dtype = self.linear_relpos.weight.dtype + if not self.use_chain_relative: + rp = self._relpos_indices(res_id=res_id) + return self.linear_relpos( + one_hot(rp, num_classes=self.num_bins, dtype=dtype)) + else: + rp, rp_entity_id, rp_rel_chain = self._relpos_indices( + res_id=res_id, + sym_id=sym_id, + asym_id=asym_id, + entity_id=entity_id) + rp = one_hot(rp, num_classes=(2 * self.relpos_k + 2), dtype=dtype) + rp_entity_id = rp_entity_id.type(dtype) + rp_rel_chain = one_hot( + rp_rel_chain, + num_classes=(2 * self.max_relative_chain + 2), + dtype=dtype) + return self.linear_relpos( + torch.cat([rp, rp_entity_id, rp_rel_chain], dim=-1)) + + def forward( + self, + tf: torch.Tensor, + msa: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # [*, N_res, d_pair] + if self.tf_dim == 21: + # multimer use 21 target dim + tf = tf[..., 1:] + # convert type if necessary + tf = tf.type(self.linear_tf_z_i.weight.dtype) + msa = msa.type(self.linear_tf_z_i.weight.dtype) + n_clust = msa.shape[-3] + + msa_emb = self.linear_msa_m(msa) + # target_feat (aatype) into msa representation + tf_m = ( + self.linear_tf_m(tf).unsqueeze(-3).expand( + ((-1, ) * len(tf.shape[:-2]) + # noqa W504 + (n_clust, -1, -1)))) + msa_emb += tf_m + + tf_emb_i = 
self.linear_tf_z_i(tf) + tf_emb_j = self.linear_tf_z_j(tf) + pair_emb = tf_emb_i[..., None, :] + tf_emb_j[..., None, :, :] + + return msa_emb, pair_emb + + +class RecyclingEmbedder(nn.Module): + + def __init__( + self, + d_msa: int, + d_pair: int, + min_bin: float, + max_bin: float, + num_bins: int, + inf: float = 1e8, + **kwargs, + ): + super(RecyclingEmbedder, self).__init__() + + self.d_msa = d_msa + self.d_pair = d_pair + self.min_bin = min_bin + self.max_bin = max_bin + self.num_bins = num_bins + self.inf = inf + + self.squared_bins = None + + self.linear = Linear(self.num_bins, self.d_pair) + self.layer_norm_m = LayerNorm(self.d_msa) + self.layer_norm_z = LayerNorm(self.d_pair) + + def forward( + self, + m: torch.Tensor, + z: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + m_update = self.layer_norm_m(m) + z_update = self.layer_norm_z(z) + + return m_update, z_update + + def recyle_pos( + self, + x: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if self.squared_bins is None: + bins = torch.linspace( + self.min_bin, + self.max_bin, + self.num_bins, + dtype=torch.float if self.training else x.dtype, + device=x.device, + requires_grad=False, + ) + self.squared_bins = bins**2 + upper = torch.cat( + [self.squared_bins[1:], + self.squared_bins.new_tensor([self.inf])], + dim=-1) + if self.training: + x = x.float() + d = torch.sum( + (x[..., None, :] - x[..., None, :, :])**2, dim=-1, keepdims=True) + d = ((d > self.squared_bins) * # noqa W504 + (d < upper)).type(self.linear.weight.dtype) + d = self.linear(d) + return d + + +class TemplateAngleEmbedder(nn.Module): + + def __init__( + self, + d_in: int, + d_out: int, + **kwargs, + ): + super(TemplateAngleEmbedder, self).__init__() + + self.d_out = d_out + self.d_in = d_in + + self.linear_1 = Linear(self.d_in, self.d_out, init='relu') + self.act = nn.GELU() + self.linear_2 = Linear(self.d_out, self.d_out, init='relu') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.linear_1(x.type(self.linear_1.weight.dtype)) + x = self.act(x) + x = self.linear_2(x) + return x + + +class TemplatePairEmbedder(nn.Module): + + def __init__( + self, + d_in: int, + v2_d_in: list, + d_out: int, + d_pair: int, + v2_feature: bool = False, + **kwargs, + ): + super(TemplatePairEmbedder, self).__init__() + + self.d_out = d_out + self.v2_feature = v2_feature + if self.v2_feature: + self.d_in = v2_d_in + self.linear = SimpleModuleList() + for d_in in self.d_in: + self.linear.append(Linear(d_in, self.d_out, init='relu')) + self.z_layer_norm = LayerNorm(d_pair) + self.z_linear = Linear(d_pair, self.d_out, init='relu') + else: + self.d_in = d_in + self.linear = Linear(self.d_in, self.d_out, init='relu') + + def forward( + self, + x, + z, + ) -> torch.Tensor: + if not self.v2_feature: + x = self.linear(x.type(self.linear.weight.dtype)) + return x + else: + dtype = self.z_linear.weight.dtype + t = self.linear[0](x[0].type(dtype)) + for i, s in enumerate(x[1:]): + t = residual(t, self.linear[i + 1](s.type(dtype)), + self.training) + t = residual(t, self.z_linear(self.z_layer_norm(z)), self.training) + return t + + +class ExtraMSAEmbedder(nn.Module): + + def __init__( + self, + d_in: int, + d_out: int, + **kwargs, + ): + super(ExtraMSAEmbedder, self).__init__() + + self.d_in = d_in + self.d_out = d_out + self.linear = Linear(self.d_in, self.d_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x.type(self.linear.weight.dtype)) diff --git a/modelscope/models/science/unifold/modules/evoformer.py 
b/modelscope/models/science/unifold/modules/evoformer.py new file mode 100644 index 00000000..b0834986 --- /dev/null +++ b/modelscope/models/science/unifold/modules/evoformer.py @@ -0,0 +1,362 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +from functools import partial +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from unicore.utils import checkpoint_sequential + +from .attentions import (MSAColumnAttention, MSAColumnGlobalAttention, + MSARowAttentionWithPairBias, TriangleAttentionEnding, + TriangleAttentionStarting) +from .common import (Linear, OuterProductMean, SimpleModuleList, Transition, + bias_dropout_residual, residual, tri_mul_residual) +from .triangle_multiplication import (TriangleMultiplicationIncoming, + TriangleMultiplicationOutgoing) + + +class EvoformerIteration(nn.Module): + + def __init__( + self, + d_msa: int, + d_pair: int, + d_hid_msa_att: int, + d_hid_opm: int, + d_hid_mul: int, + d_hid_pair_att: int, + num_heads_msa: int, + num_heads_pair: int, + transition_n: int, + msa_dropout: float, + pair_dropout: float, + outer_product_mean_first: bool, + inf: float, + eps: float, + _is_extra_msa_stack: bool = False, + ): + super(EvoformerIteration, self).__init__() + + self._is_extra_msa_stack = _is_extra_msa_stack + self.outer_product_mean_first = outer_product_mean_first + + self.msa_att_row = MSARowAttentionWithPairBias( + d_msa=d_msa, + d_pair=d_pair, + d_hid=d_hid_msa_att, + num_heads=num_heads_msa, + ) + + if _is_extra_msa_stack: + self.msa_att_col = MSAColumnGlobalAttention( + d_in=d_msa, + d_hid=d_hid_msa_att, + num_heads=num_heads_msa, + inf=inf, + eps=eps, + ) + else: + self.msa_att_col = MSAColumnAttention( + d_msa, + d_hid_msa_att, + num_heads_msa, + ) + + self.msa_transition = Transition( + d_in=d_msa, + n=transition_n, + ) + + self.outer_product_mean = OuterProductMean( + d_msa, + d_pair, + d_hid_opm, + ) + + self.tri_mul_out = TriangleMultiplicationOutgoing( + d_pair, + d_hid_mul, + ) + self.tri_mul_in = TriangleMultiplicationIncoming( + d_pair, + d_hid_mul, + ) + + self.tri_att_start = TriangleAttentionStarting( + d_pair, + d_hid_pair_att, + num_heads_pair, + ) + self.tri_att_end = TriangleAttentionEnding( + d_pair, + d_hid_pair_att, + num_heads_pair, + ) + + self.pair_transition = Transition( + d_in=d_pair, + n=transition_n, + ) + + self.row_dropout_share_dim = -3 + self.col_dropout_share_dim = -2 + self.msa_dropout = msa_dropout + self.pair_dropout = pair_dropout + + def forward( + self, + m: torch.Tensor, + z: torch.Tensor, + msa_mask: torch.Tensor, + pair_mask: torch.Tensor, + msa_row_attn_mask: torch.Tensor, + msa_col_attn_mask: Optional[torch.Tensor], + tri_start_attn_mask: torch.Tensor, + tri_end_attn_mask: torch.Tensor, + chunk_size: Optional[int] = None, + block_size: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if self.outer_product_mean_first: + z = residual( + z, + self.outer_product_mean( + m, mask=msa_mask, chunk_size=chunk_size), self.training) + + m = bias_dropout_residual( + self.msa_att_row, + m, + self.msa_att_row( + m, z=z, attn_mask=msa_row_attn_mask, chunk_size=chunk_size), + self.row_dropout_share_dim, + self.msa_dropout, + self.training, + ) + if self._is_extra_msa_stack: + m = residual( + m, self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size), + self.training) + else: + m = bias_dropout_residual( + self.msa_att_col, + m, + self.msa_att_col( + m, 
attn_mask=msa_col_attn_mask, chunk_size=chunk_size), + self.col_dropout_share_dim, + self.msa_dropout, + self.training, + ) + m = residual(m, self.msa_transition(m, chunk_size=chunk_size), + self.training) + if not self.outer_product_mean_first: + z = residual( + z, + self.outer_product_mean( + m, mask=msa_mask, chunk_size=chunk_size), self.training) + + z = tri_mul_residual( + self.tri_mul_out, + z, + self.tri_mul_out(z, mask=pair_mask, block_size=block_size), + self.row_dropout_share_dim, + self.pair_dropout, + self.training, + block_size=block_size, + ) + + z = tri_mul_residual( + self.tri_mul_in, + z, + self.tri_mul_in(z, mask=pair_mask, block_size=block_size), + self.row_dropout_share_dim, + self.pair_dropout, + self.training, + block_size=block_size, + ) + + z = bias_dropout_residual( + self.tri_att_start, + z, + self.tri_att_start( + z, attn_mask=tri_start_attn_mask, chunk_size=chunk_size), + self.row_dropout_share_dim, + self.pair_dropout, + self.training, + ) + + z = bias_dropout_residual( + self.tri_att_end, + z, + self.tri_att_end( + z, attn_mask=tri_end_attn_mask, chunk_size=chunk_size), + self.col_dropout_share_dim, + self.pair_dropout, + self.training, + ) + z = residual(z, self.pair_transition(z, chunk_size=chunk_size), + self.training) + return m, z + + +class EvoformerStack(nn.Module): + + def __init__( + self, + d_msa: int, + d_pair: int, + d_hid_msa_att: int, + d_hid_opm: int, + d_hid_mul: int, + d_hid_pair_att: int, + d_single: int, + num_heads_msa: int, + num_heads_pair: int, + num_blocks: int, + transition_n: int, + msa_dropout: float, + pair_dropout: float, + outer_product_mean_first: bool, + inf: float, + eps: float, + _is_extra_msa_stack: bool = False, + **kwargs, + ): + super(EvoformerStack, self).__init__() + + self._is_extra_msa_stack = _is_extra_msa_stack + + self.blocks = SimpleModuleList() + + for _ in range(num_blocks): + self.blocks.append( + EvoformerIteration( + d_msa=d_msa, + d_pair=d_pair, + d_hid_msa_att=d_hid_msa_att, + d_hid_opm=d_hid_opm, + d_hid_mul=d_hid_mul, + d_hid_pair_att=d_hid_pair_att, + num_heads_msa=num_heads_msa, + num_heads_pair=num_heads_pair, + transition_n=transition_n, + msa_dropout=msa_dropout, + pair_dropout=pair_dropout, + outer_product_mean_first=outer_product_mean_first, + inf=inf, + eps=eps, + _is_extra_msa_stack=_is_extra_msa_stack, + )) + if not self._is_extra_msa_stack: + self.linear = Linear(d_msa, d_single) + else: + self.linear = None + + def forward( + self, + m: torch.Tensor, + z: torch.Tensor, + msa_mask: torch.Tensor, + pair_mask: torch.Tensor, + msa_row_attn_mask: torch.Tensor, + msa_col_attn_mask: torch.Tensor, + tri_start_attn_mask: torch.Tensor, + tri_end_attn_mask: torch.Tensor, + chunk_size: int, + block_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + blocks = [ + partial( + b, + msa_mask=msa_mask, + pair_mask=pair_mask, + msa_row_attn_mask=msa_row_attn_mask, + msa_col_attn_mask=msa_col_attn_mask, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + chunk_size=chunk_size, + block_size=block_size) for b in self.blocks + ] + + m, z = checkpoint_sequential( + blocks, + input=(m, z), + ) + + s = None + if not self._is_extra_msa_stack: + seq_dim = -3 + index = torch.tensor([0], device=m.device) + s = self.linear(torch.index_select(m, dim=seq_dim, index=index)) + s = s.squeeze(seq_dim) + + return m, z, s + + +class ExtraMSAStack(EvoformerStack): + + def __init__( + self, + d_msa: int, + d_pair: int, + d_hid_msa_att: int, + d_hid_opm: int, + d_hid_mul: int, 
+ d_hid_pair_att: int, + num_heads_msa: int, + num_heads_pair: int, + num_blocks: int, + transition_n: int, + msa_dropout: float, + pair_dropout: float, + outer_product_mean_first: bool, + inf: float, + eps: float, + **kwargs, + ): + super(ExtraMSAStack, self).__init__( + d_msa=d_msa, + d_pair=d_pair, + d_hid_msa_att=d_hid_msa_att, + d_hid_opm=d_hid_opm, + d_hid_mul=d_hid_mul, + d_hid_pair_att=d_hid_pair_att, + d_single=None, + num_heads_msa=num_heads_msa, + num_heads_pair=num_heads_pair, + num_blocks=num_blocks, + transition_n=transition_n, + msa_dropout=msa_dropout, + pair_dropout=pair_dropout, + outer_product_mean_first=outer_product_mean_first, + inf=inf, + eps=eps, + _is_extra_msa_stack=True, + ) + + def forward( + self, + m: torch.Tensor, + z: torch.Tensor, + msa_mask: Optional[torch.Tensor] = None, + pair_mask: Optional[torch.Tensor] = None, + msa_row_attn_mask: torch.Tensor = None, + msa_col_attn_mask: torch.Tensor = None, + tri_start_attn_mask: torch.Tensor = None, + tri_end_attn_mask: torch.Tensor = None, + chunk_size: int = None, + block_size: int = None, + ) -> torch.Tensor: + _, z, _ = super().forward( + m, + z, + msa_mask=msa_mask, + pair_mask=pair_mask, + msa_row_attn_mask=msa_row_attn_mask, + msa_col_attn_mask=msa_col_attn_mask, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + chunk_size=chunk_size, + block_size=block_size) + return z diff --git a/modelscope/models/science/unifold/modules/featurization.py b/modelscope/models/science/unifold/modules/featurization.py new file mode 100644 index 00000000..b62adc9d --- /dev/null +++ b/modelscope/models/science/unifold/modules/featurization.py @@ -0,0 +1,195 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
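The Evoformer stack above binds the shared masks and chunking options into every block with functools.partial and then runs the blocks sequentially (with activation checkpointing via unicore's checkpoint_sequential in the real code). A simplified sketch of the same wiring that uses a plain loop instead of checkpointing (ToyBlock is a hypothetical stand-in for one iteration):

import torch
import torch.nn as nn
from functools import partial

class ToyBlock(nn.Module):
    def forward(self, m, z, msa_mask=None, pair_mask=None):
        return m + 1.0, z + 1.0            # stand-in for one Evoformer iteration

blocks = [ToyBlock() for _ in range(4)]
m = torch.zeros(8, 16, 32)                 # [n_seq, n_res, d_msa]
z = torch.zeros(16, 16, 64)                # [n_res, n_res, d_pair]
msa_mask, pair_mask = torch.ones(8, 16), torch.ones(16, 16)
bound = [partial(b, msa_mask=msa_mask, pair_mask=pair_mask) for b in blocks]
for blk in bound:
    m, z = blk(m, z)                       # checkpoint_sequential wraps this loop in the model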
+ +from typing import Dict + +import torch +import torch.nn as nn +from unicore.utils import batched_gather, one_hot + +from modelscope.models.science.unifold.data import residue_constants as rc +from .frame import Frame + + +def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks): + is_gly = aatype == rc.restype_order['G'] + ca_idx = rc.atom_order['CA'] + cb_idx = rc.atom_order['CB'] + pseudo_beta = torch.where( + is_gly[..., None].expand(*((-1, ) * len(is_gly.shape)), 3), + all_atom_positions[..., ca_idx, :], + all_atom_positions[..., cb_idx, :], + ) + + if all_atom_masks is not None: + pseudo_beta_mask = torch.where( + is_gly, + all_atom_masks[..., ca_idx], + all_atom_masks[..., cb_idx], + ) + return pseudo_beta, pseudo_beta_mask + else: + return pseudo_beta + + +def atom14_to_atom37(atom14, batch): + atom37_data = batched_gather( + atom14, + batch['residx_atom37_to_atom14'], + dim=-2, + num_batch_dims=len(atom14.shape[:-2]), + ) + + atom37_data = atom37_data * batch['atom37_atom_exists'][..., None] + + return atom37_data + + +def build_template_angle_feat(template_feats, v2_feature=False): + template_aatype = template_feats['template_aatype'] + torsion_angles_sin_cos = template_feats['template_torsion_angles_sin_cos'] + torsion_angles_mask = template_feats['template_torsion_angles_mask'] + if not v2_feature: + alt_torsion_angles_sin_cos = template_feats[ + 'template_alt_torsion_angles_sin_cos'] + template_angle_feat = torch.cat( + [ + one_hot(template_aatype, 22), + torsion_angles_sin_cos.reshape( + *torsion_angles_sin_cos.shape[:-2], 14), + alt_torsion_angles_sin_cos.reshape( + *alt_torsion_angles_sin_cos.shape[:-2], 14), + torsion_angles_mask, + ], + dim=-1, + ) + template_angle_mask = torsion_angles_mask[..., 2] + else: + chi_mask = torsion_angles_mask[..., 3:] + chi_angles_sin = torsion_angles_sin_cos[..., 3:, 0] * chi_mask + chi_angles_cos = torsion_angles_sin_cos[..., 3:, 1] * chi_mask + template_angle_feat = torch.cat( + [ + one_hot(template_aatype, 22), + chi_angles_sin, + chi_angles_cos, + chi_mask, + ], + dim=-1, + ) + template_angle_mask = chi_mask[..., 0] + return template_angle_feat, template_angle_mask + + +def build_template_pair_feat( + batch, + min_bin, + max_bin, + num_bins, + eps=1e-20, + inf=1e8, +): + template_mask = batch['template_pseudo_beta_mask'] + template_mask_2d = template_mask[..., None] * template_mask[..., None, :] + + tpb = batch['template_pseudo_beta'] + dgram = torch.sum( + (tpb[..., None, :] - tpb[..., None, :, :])**2, dim=-1, keepdim=True) + lower = torch.linspace(min_bin, max_bin, num_bins, device=tpb.device)**2 + upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1) + dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype) + + to_concat = [dgram, template_mask_2d[..., None]] + + aatype_one_hot = nn.functional.one_hot( + batch['template_aatype'], + rc.restype_num + 2, + ) + + n_res = batch['template_aatype'].shape[-1] + to_concat.append(aatype_one_hot[..., None, :, :].expand( + *aatype_one_hot.shape[:-2], n_res, -1, -1)) + to_concat.append(aatype_one_hot[..., + None, :].expand(*aatype_one_hot.shape[:-2], + -1, n_res, -1)) + + to_concat.append(template_mask_2d.new_zeros(*template_mask_2d.shape, 3)) + to_concat.append(template_mask_2d[..., None]) + + act = torch.cat(to_concat, dim=-1) + act = act * template_mask_2d[..., None] + + return act + + +def build_template_pair_feat_v2( + batch, + min_bin, + max_bin, + num_bins, + multichain_mask_2d=None, + eps=1e-20, + inf=1e8, +): + template_mask = batch['template_pseudo_beta_mask'] + 
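# Note (standalone illustration only): both template featurizers above one-hot the
# squared pairwise pseudo-beta distances against squared bin edges, e.g.:
#
#   import torch
#   coords = torch.randn(10, 3)                              # toy pseudo-beta coordinates
#   d2 = ((coords[:, None, :] - coords[None, :, :]) ** 2).sum(-1, keepdim=True)
#   lower = torch.linspace(3.25, 50.75, 39) ** 2             # assumed distogram settings
#   upper = torch.cat([lower[1:], lower.new_tensor([1e8])])
#   dgram = ((d2 > lower) & (d2 < upper)).float()            # [10, 10, 39] one-hot distogram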
template_mask_2d = template_mask[..., None] * template_mask[..., None, :] + if multichain_mask_2d is not None: + template_mask_2d *= multichain_mask_2d + + tpb = batch['template_pseudo_beta'] + dgram = torch.sum( + (tpb[..., None, :] - tpb[..., None, :, :])**2, dim=-1, keepdim=True) + lower = torch.linspace(min_bin, max_bin, num_bins, device=tpb.device)**2 + upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1) + dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype) + dgram *= template_mask_2d[..., None] + to_concat = [dgram, template_mask_2d[..., None]] + + aatype_one_hot = one_hot( + batch['template_aatype'], + rc.restype_num + 2, + ) + + n_res = batch['template_aatype'].shape[-1] + to_concat.append(aatype_one_hot[..., None, :, :].expand( + *aatype_one_hot.shape[:-2], n_res, -1, -1)) + to_concat.append(aatype_one_hot[..., + None, :].expand(*aatype_one_hot.shape[:-2], + -1, n_res, -1)) + + n, ca, c = [rc.atom_order[a] for a in ['N', 'CA', 'C']] + rigids = Frame.make_transform_from_reference( + n_xyz=batch['template_all_atom_positions'][..., n, :], + ca_xyz=batch['template_all_atom_positions'][..., ca, :], + c_xyz=batch['template_all_atom_positions'][..., c, :], + eps=eps, + ) + points = rigids.get_trans()[..., None, :, :] + rigid_vec = rigids[..., None].invert_apply(points) + + inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec**2, dim=-1)) + + t_aa_masks = batch['template_all_atom_mask'] + backbone_mask = t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., + c] + backbone_mask_2d = backbone_mask[..., :, None] * backbone_mask[..., + None, :] + if multichain_mask_2d is not None: + backbone_mask_2d *= multichain_mask_2d + + inv_distance_scalar = inv_distance_scalar * backbone_mask_2d + unit_vector_data = rigid_vec * inv_distance_scalar[..., None] + to_concat.extend(torch.unbind(unit_vector_data[..., None, :], dim=-1)) + to_concat.append(backbone_mask_2d[..., None]) + + return to_concat + + +def build_extra_msa_feat(batch): + msa_1hot = one_hot(batch['extra_msa'], 23) + msa_feat = [ + msa_1hot, + batch['extra_msa_has_deletion'].unsqueeze(-1), + batch['extra_msa_deletion_value'].unsqueeze(-1), + ] + return torch.cat(msa_feat, dim=-1) diff --git a/modelscope/models/science/unifold/modules/frame.py b/modelscope/models/science/unifold/modules/frame.py new file mode 100644 index 00000000..5a0e4d6a --- /dev/null +++ b/modelscope/models/science/unifold/modules/frame.py @@ -0,0 +1,562 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
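build_template_pair_feat_v2 above consumes backbone frames produced by Frame.make_transform_from_reference from the N, CA and C atom positions. A rough standalone sketch of how such a frame can be built by Gram-Schmidt orthogonalization with CA as the origin (an assumption-level illustration, not the classes defined below):

import torch

def backbone_frame(n_xyz, ca_xyz, c_xyz, eps=1e-8):
    # hypothetical helper: x-axis toward C, y-axis in the N-CA-C plane, origin at CA
    e1 = c_xyz - ca_xyz
    e1 = e1 / (e1.norm(dim=-1, keepdim=True) + eps)
    v2 = n_xyz - ca_xyz
    e2 = v2 - (v2 * e1).sum(-1, keepdim=True) * e1
    e2 = e2 / (e2.norm(dim=-1, keepdim=True) + eps)
    e3 = torch.cross(e1, e2, dim=-1)
    rot = torch.stack([e1, e2, e3], dim=-1)        # [..., 3, 3] rotation matrix
    return rot, ca_xyz                             # rotation plus translation

rot, trans = backbone_frame(torch.randn(3), torch.randn(3), torch.randn(3))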
+ +from __future__ import annotations # noqa +from typing import Any, Callable, Iterable, Optional, Sequence, Tuple + +import numpy as np +import torch + + +def zero_translation( + batch_dims: Tuple[int], + dtype: Optional[torch.dtype] = torch.float, + device: Optional[torch.device] = torch.device('cpu'), + requires_grad: bool = False, +) -> torch.Tensor: + trans = torch.zeros((*batch_dims, 3), + dtype=dtype, + device=device, + requires_grad=requires_grad) + return trans + + +# pylint: disable=bad-whitespace +_QUAT_TO_ROT = np.zeros((4, 4, 3, 3), dtype=np.float32) + +_QUAT_TO_ROT[0, 0] = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] # rr +_QUAT_TO_ROT[1, 1] = [[1, 0, 0], [0, -1, 0], [0, 0, -1]] # ii +_QUAT_TO_ROT[2, 2] = [[-1, 0, 0], [0, 1, 0], [0, 0, -1]] # jj +_QUAT_TO_ROT[3, 3] = [[-1, 0, 0], [0, -1, 0], [0, 0, 1]] # kk + +_QUAT_TO_ROT[1, 2] = [[0, 2, 0], [2, 0, 0], [0, 0, 0]] # ij +_QUAT_TO_ROT[1, 3] = [[0, 0, 2], [0, 0, 0], [2, 0, 0]] # ik +_QUAT_TO_ROT[2, 3] = [[0, 0, 0], [0, 0, 2], [0, 2, 0]] # jk + +_QUAT_TO_ROT[0, 1] = [[0, 0, 0], [0, 0, -2], [0, 2, 0]] # ir +_QUAT_TO_ROT[0, 2] = [[0, 0, 2], [0, 0, 0], [-2, 0, 0]] # jr +_QUAT_TO_ROT[0, 3] = [[0, -2, 0], [2, 0, 0], [0, 0, 0]] # kr + +_QUAT_TO_ROT = _QUAT_TO_ROT.reshape(4, 4, 9) +_QUAT_TO_ROT_tensor = torch.from_numpy(_QUAT_TO_ROT) + +_QUAT_MULTIPLY = np.zeros((4, 4, 4)) +_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], + [0, 0, 0, -1]] + +_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], + [0, 0, -1, 0]] + +_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], + [0, 1, 0, 0]] + +_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], + [1, 0, 0, 0]] + +_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :] +_QUAT_MULTIPLY_BY_VEC_tensor = torch.from_numpy(_QUAT_MULTIPLY_BY_VEC) + + +class Rotation: + + def __init__( + self, + mat: torch.Tensor, + ): + if mat.shape[-2:] != (3, 3): + raise ValueError(f'incorrect rotation shape: {mat.shape}') + self._mat = mat + + @staticmethod + def identity( + shape, + dtype: Optional[torch.dtype] = torch.float, + device: Optional[torch.device] = torch.device('cpu'), + requires_grad: bool = False, + ) -> Rotation: + mat = torch.eye( + 3, dtype=dtype, device=device, requires_grad=requires_grad) + mat = mat.view(*((1, ) * len(shape)), 3, 3) + mat = mat.expand(*shape, -1, -1) + return Rotation(mat) + + @staticmethod + def mat_mul_mat(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + return (a.float() @ b.float()).type(a.dtype) + + @staticmethod + def mat_mul_vec(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + return (r.float() @ t.float().unsqueeze(-1)).squeeze(-1).type(t.dtype) + + def __getitem__(self, index: Any) -> Rotation: + if not isinstance(index, tuple): + index = (index, ) + return Rotation(mat=self._mat[index + (slice(None), slice(None))]) + + def __mul__(self, right: Any) -> Rotation: + if isinstance(right, (int, float)): + return Rotation(mat=self._mat * right) + elif isinstance(right, torch.Tensor): + return Rotation(mat=self._mat * right[..., None, None]) + else: + raise TypeError( + f'multiplicand must be a tensor or a number, got {type(right)}.' 
+ ) + + def __rmul__(self, left: Any) -> Rotation: + return self.__mul__(left) + + def __matmul__(self, other: Rotation) -> Rotation: + new_mat = Rotation.mat_mul_mat(self.rot_mat, other.rot_mat) + return Rotation(mat=new_mat) + + @property + def _inv_mat(self): + return self._mat.transpose(-1, -2) + + @property + def rot_mat(self) -> torch.Tensor: + return self._mat + + def invert(self) -> Rotation: + return Rotation(mat=self._inv_mat) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + return Rotation.mat_mul_vec(self._mat, pts) + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + return Rotation.mat_mul_vec(self._inv_mat, pts) + + # inherit tensor behaviors + @property + def shape(self) -> torch.Size: + s = self._mat.shape[:-2] + return s + + @property + def dtype(self) -> torch.dtype: + return self._mat.dtype + + @property + def device(self) -> torch.device: + return self._mat.device + + @property + def requires_grad(self) -> bool: + return self._mat.requires_grad + + def unsqueeze(self, dim: int) -> Rotation: + if dim >= len(self.shape): + raise ValueError('Invalid dimension') + + rot_mats = self._mat.unsqueeze(dim if dim >= 0 else dim - 2) + return Rotation(mat=rot_mats) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], + torch.Tensor]) -> Rotation: + mat = self._mat.view(self._mat.shape[:-2] + (9, )) + mat = torch.stack(list(map(fn, torch.unbind(mat, dim=-1))), dim=-1) + mat = mat.view(mat.shape[:-1] + (3, 3)) + return Rotation(mat=mat) + + @staticmethod + def cat(rs: Sequence[Rotation], dim: int) -> Rotation: + rot_mats = [r.rot_mat for r in rs] + rot_mats = torch.cat(rot_mats, dim=dim if dim >= 0 else dim - 2) + + return Rotation(mat=rot_mats) + + def cuda(self) -> Rotation: + return Rotation(mat=self._mat.cuda()) + + def to(self, device: Optional[torch.device], + dtype: Optional[torch.dtype]) -> Rotation: + return Rotation(mat=self._mat.to(device=device, dtype=dtype)) + + def type(self, dtype: Optional[torch.dtype]) -> Rotation: + return Rotation(mat=self._mat.type(dtype)) + + def detach(self) -> Rotation: + return Rotation(mat=self._mat.detach()) + + +class Frame: + + def __init__( + self, + rotation: Optional[Rotation], + translation: Optional[torch.Tensor], + ): + if rotation is None and translation is None: + rotation = Rotation.identity((0, )) + translation = zero_translation((0, )) + elif translation is None: + translation = zero_translation(rotation.shape, rotation.dtype, + rotation.device, + rotation.requires_grad) + + elif rotation is None: + rotation = Rotation.identity( + translation.shape[:-1], + translation.dtype, + translation.device, + translation.requires_grad, + ) + + if (rotation.shape != translation.shape[:-1]) or (rotation.device + != # noqa W504 + translation.device): + raise ValueError('RotationMatrix and translation incompatible') + + self._r = rotation + self._t = translation + + @staticmethod + def identity( + shape: Iterable[int], + dtype: Optional[torch.dtype] = torch.float, + device: Optional[torch.device] = torch.device('cpu'), + requires_grad: bool = False, + ) -> Frame: + return Frame( + Rotation.identity(shape, dtype, device, requires_grad), + zero_translation(shape, dtype, device, requires_grad), + ) + + def __getitem__( + self, + index: Any, + ) -> Frame: + if type(index) != tuple: + index = (index, ) + + return Frame( + self._r[index], + self._t[index + (slice(None), )], + ) + + def __mul__( + self, + right: torch.Tensor, + ) -> Frame: + if not (isinstance(right, torch.Tensor)): + raise TypeError('The other multiplicand 
must be a Tensor') + + new_rots = self._r * right + new_trans = self._t * right[..., None] + + return Frame(new_rots, new_trans) + + def __rmul__( + self, + left: torch.Tensor, + ) -> Frame: + return self.__mul__(left) + + @property + def shape(self) -> torch.Size: + s = self._t.shape[:-1] + return s + + @property + def device(self) -> torch.device: + return self._t.device + + def get_rots(self) -> Rotation: + return self._r + + def get_trans(self) -> torch.Tensor: + return self._t + + def compose( + self, + other: Frame, + ) -> Frame: + new_rot = self._r @ other._r + new_trans = self._r.apply(other._t) + self._t + return Frame(new_rot, new_trans) + + def apply( + self, + pts: torch.Tensor, + ) -> torch.Tensor: + rotated = self._r.apply(pts) + return rotated + self._t + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + pts = pts - self._t + return self._r.invert_apply(pts) + + def invert(self) -> Frame: + rot_inv = self._r.invert() + trn_inv = rot_inv.apply(self._t) + + return Frame(rot_inv, -1 * trn_inv) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], + torch.Tensor]) -> Frame: + new_rots = self._r.map_tensor_fn(fn) + new_trans = torch.stack( + list(map(fn, torch.unbind(self._t, dim=-1))), dim=-1) + + return Frame(new_rots, new_trans) + + def to_tensor_4x4(self) -> torch.Tensor: + tensor = self._t.new_zeros((*self.shape, 4, 4)) + tensor[..., :3, :3] = self._r.rot_mat + tensor[..., :3, 3] = self._t + tensor[..., 3, 3] = 1 + return tensor + + @staticmethod + def from_tensor_4x4(t: torch.Tensor) -> Frame: + if t.shape[-2:] != (4, 4): + raise ValueError('Incorrectly shaped input tensor') + + rots = Rotation(mat=t[..., :3, :3]) + trans = t[..., :3, 3] + + return Frame(rots, trans) + + @staticmethod + def from_3_points( + p_neg_x_axis: torch.Tensor, + origin: torch.Tensor, + p_xy_plane: torch.Tensor, + eps: float = 1e-8, + ) -> Frame: + p_neg_x_axis = torch.unbind(p_neg_x_axis, dim=-1) + origin = torch.unbind(origin, dim=-1) + p_xy_plane = torch.unbind(p_xy_plane, dim=-1) + + e0 = [c1 - c2 for c1, c2 in zip(origin, p_neg_x_axis)] + e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane, origin)] + + denom = torch.sqrt(sum((c * c for c in e0)) + eps) + e0 = [c / denom for c in e0] + dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) + e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] + denom = torch.sqrt(sum((c * c for c in e1)) + eps) + e1 = [c / denom for c in e1] + e2 = [ + e0[1] * e1[2] - e0[2] * e1[1], + e0[2] * e1[0] - e0[0] * e1[2], + e0[0] * e1[1] - e0[1] * e1[0], + ] + + rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1) + rots = rots.reshape(rots.shape[:-1] + (3, 3)) + + rot_obj = Rotation(mat=rots) + + return Frame(rot_obj, torch.stack(origin, dim=-1)) + + def unsqueeze( + self, + dim: int, + ) -> Frame: + if dim >= len(self.shape): + raise ValueError('Invalid dimension') + rots = self._r.unsqueeze(dim) + trans = self._t.unsqueeze(dim if dim >= 0 else dim - 1) + + return Frame(rots, trans) + + @staticmethod + def cat( + Ts: Sequence[Frame], + dim: int, + ) -> Frame: + rots = Rotation.cat([T._r for T in Ts], dim) + trans = torch.cat([T._t for T in Ts], dim=dim if dim >= 0 else dim - 1) + + return Frame(rots, trans) + + def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Frame: + return Frame(fn(self._r), self._t) + + def apply_trans_fn(self, fn: Callable[[torch.Tensor], + torch.Tensor]) -> Frame: + return Frame(self._r, fn(self._t)) + + def scale_translation(self, trans_scale_factor: float) -> Frame: + # fn = lambda t: t * trans_scale_factor + def 
fn(t): + return t * trans_scale_factor + + return self.apply_trans_fn(fn) + + def stop_rot_gradient(self) -> Frame: + # fn = lambda r: r.detach() + def fn(r): + return r.detach() + + return self.apply_rot_fn(fn) + + @staticmethod + def make_transform_from_reference(n_xyz, ca_xyz, c_xyz, eps=1e-20): + input_dtype = ca_xyz.dtype + n_xyz = n_xyz.float() + ca_xyz = ca_xyz.float() + c_xyz = c_xyz.float() + n_xyz = n_xyz - ca_xyz + c_xyz = c_xyz - ca_xyz + + c_x, c_y, d_pair = [c_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + c_x**2 + c_y**2) + sin_c1 = -c_y / norm + cos_c1 = c_x / norm + + c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3)) + c1_rots[..., 0, 0] = cos_c1 + c1_rots[..., 0, 1] = -1 * sin_c1 + c1_rots[..., 1, 0] = sin_c1 + c1_rots[..., 1, 1] = cos_c1 + c1_rots[..., 2, 2] = 1 + + norm = torch.sqrt(eps + c_x**2 + c_y**2 + d_pair**2) + sin_c2 = d_pair / norm + cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm + + c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + c2_rots[..., 0, 0] = cos_c2 + c2_rots[..., 0, 2] = sin_c2 + c2_rots[..., 1, 1] = 1 + c2_rots[..., 2, 0] = -1 * sin_c2 + c2_rots[..., 2, 2] = cos_c2 + + c_rots = Rotation.mat_mul_mat(c2_rots, c1_rots) + n_xyz = Rotation.mat_mul_vec(c_rots, n_xyz) + + _, n_y, n_z = [n_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + n_y**2 + n_z**2) + sin_n = -n_z / norm + cos_n = n_y / norm + + n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + n_rots[..., 0, 0] = 1 + n_rots[..., 1, 1] = cos_n + n_rots[..., 1, 2] = -1 * sin_n + n_rots[..., 2, 1] = sin_n + n_rots[..., 2, 2] = cos_n + + rots = Rotation.mat_mul_mat(n_rots, c_rots) + + rots = rots.transpose(-1, -2) + rot_obj = Rotation(mat=rots.type(input_dtype)) + + return Frame(rot_obj, ca_xyz.type(input_dtype)) + + def cuda(self) -> Frame: + return Frame(self._r.cuda(), self._t.cuda()) + + @property + def dtype(self) -> torch.dtype: + assert self._r.dtype == self._t.dtype + return self._r.dtype + + def type(self, dtype) -> Frame: + return Frame(self._r.type(dtype), self._t.type(dtype)) + + +class Quaternion: + + def __init__(self, quaternion: torch.Tensor, translation: torch.Tensor): + if quaternion.shape[-1] != 4: + raise ValueError(f'incorrect quaternion shape: {quaternion.shape}') + self._q = quaternion + self._t = translation + + @staticmethod + def identity( + shape: Iterable[int], + dtype: Optional[torch.dtype] = torch.float, + device: Optional[torch.device] = torch.device('cpu'), + requires_grad: bool = False, + ) -> Quaternion: + trans = zero_translation(shape, dtype, device, requires_grad) + quats = torch.zeros((*shape, 4), + dtype=dtype, + device=device, + requires_grad=requires_grad) + with torch.no_grad(): + quats[..., 0] = 1 + return Quaternion(quats, trans) + + def get_quats(self): + return self._q + + def get_trans(self): + return self._t + + def get_rot_mats(self): + quats = self.get_quats() + rot_mats = Quaternion.quat_to_rot(quats) + return rot_mats + + @staticmethod + def quat_to_rot(normalized_quat): + global _QUAT_TO_ROT_tensor + dtype = normalized_quat.dtype + normalized_quat = normalized_quat.float() + if _QUAT_TO_ROT_tensor.device != normalized_quat.device: + _QUAT_TO_ROT_tensor = _QUAT_TO_ROT_tensor.to( + normalized_quat.device) + rot_tensor = torch.sum( + _QUAT_TO_ROT_tensor * normalized_quat[..., :, None, None] + * normalized_quat[..., None, :, None], + dim=(-3, -2), + ) + rot_tensor = rot_tensor.type(dtype) + rot_tensor = rot_tensor.view(*rot_tensor.shape[:-1], 3, 3) + return rot_tensor + + @staticmethod + def normalize_quat(quats): + dtype = quats.dtype + 
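+        # Compute the norm in float32 for numerical stability, then cast the
+        # normalized quaternion back to the original (possibly half) precision.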
quats = quats.float() + quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True) + quats = quats.type(dtype) + return quats + + @staticmethod + def quat_multiply_by_vec(quat, vec): + dtype = quat.dtype + quat = quat.float() + vec = vec.float() + global _QUAT_MULTIPLY_BY_VEC_tensor + if _QUAT_MULTIPLY_BY_VEC_tensor.device != quat.device: + _QUAT_MULTIPLY_BY_VEC_tensor = _QUAT_MULTIPLY_BY_VEC_tensor.to( + quat.device) + mat = _QUAT_MULTIPLY_BY_VEC_tensor + reshaped_mat = mat.view((1, ) * len(quat.shape[:-1]) + mat.shape) + return torch.sum( + reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], + dim=(-3, -2), + ).type(dtype) + + def compose_q_update_vec(self, + q_update_vec: torch.Tensor, + normalize_quats: bool = True) -> torch.Tensor: + quats = self.get_quats() + new_quats = quats + Quaternion.quat_multiply_by_vec( + quats, q_update_vec) + if normalize_quats: + new_quats = Quaternion.normalize_quat(new_quats) + return new_quats + + def compose_update_vec( + self, + update_vec: torch.Tensor, + pre_rot_mat: Rotation, + ) -> Quaternion: + q_vec, t_vec = update_vec[..., :3], update_vec[..., 3:] + new_quats = self.compose_q_update_vec(q_vec) + + trans_update = pre_rot_mat.apply(t_vec) + new_trans = self._t + trans_update + + return Quaternion(new_quats, new_trans) + + def stop_rot_gradient(self) -> Quaternion: + return Quaternion(self._q.detach(), self._t) diff --git a/modelscope/models/science/unifold/modules/structure_module.py b/modelscope/models/science/unifold/modules/structure_module.py new file mode 100644 index 00000000..5d4da30b --- /dev/null +++ b/modelscope/models/science/unifold/modules/structure_module.py @@ -0,0 +1,592 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
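+#
+# Overview: StructureModule refines one rigid frame per residue over
+# num_blocks iterations. Each iteration applies invariant point attention
+# (IPA) to the single representation, predicts a quaternion/translation
+# update via BackboneUpdate, and finally decodes predicted side-chain torsion
+# angles into all-atom (atom14) coordinates using the Frame/Quaternion
+# utilities from frame.py.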
+ +import math +from typing import Tuple + +import torch +import torch.nn as nn +from unicore.modules import LayerNorm, softmax_dropout +from unicore.utils import dict_multimap, one_hot, permute_final_dims + +from modelscope.models.science.unifold.data.residue_constants import ( + restype_atom14_mask, restype_atom14_rigid_group_positions, + restype_atom14_to_rigid_group, restype_rigid_group_default_frame) +from .attentions import gen_attn_mask +from .common import Linear, SimpleModuleList, residual +from .frame import Frame, Quaternion, Rotation + + +def ipa_point_weights_init_(weights): + with torch.no_grad(): + softplus_inverse_1 = 0.541324854612918 + weights.fill_(softplus_inverse_1) + + +def torsion_angles_to_frames( + frame: Frame, + alpha: torch.Tensor, + aatype: torch.Tensor, + default_frames: torch.Tensor, +): + default_frame = Frame.from_tensor_4x4(default_frames[aatype, ...]) + + bb_rot = alpha.new_zeros((*((1, ) * len(alpha.shape[:-1])), 2)) + bb_rot[..., 1] = 1 + + alpha = torch.cat([bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], + dim=-2) + + all_rots = alpha.new_zeros(default_frame.get_rots().rot_mat.shape) + all_rots[..., 0, 0] = 1 + all_rots[..., 1, 1] = alpha[..., 1] + all_rots[..., 1, 2] = -alpha[..., 0] + all_rots[..., 2, 1:] = alpha + + all_rots = Frame(Rotation(mat=all_rots), None) + + all_frames = default_frame.compose(all_rots) + + chi2_frame_to_frame = all_frames[..., 5] + chi3_frame_to_frame = all_frames[..., 6] + chi4_frame_to_frame = all_frames[..., 7] + + chi1_frame_to_bb = all_frames[..., 4] + chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame) + chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame) + chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame) + + all_frames_to_bb = Frame.cat( + [ + all_frames[..., :5], + chi2_frame_to_bb.unsqueeze(-1), + chi3_frame_to_bb.unsqueeze(-1), + chi4_frame_to_bb.unsqueeze(-1), + ], + dim=-1, + ) + + all_frames_to_global = frame[..., None].compose(all_frames_to_bb) + + return all_frames_to_global + + +def frames_and_literature_positions_to_atom14_pos( + frame: Frame, + aatype: torch.Tensor, + default_frames, + group_idx, + atom_mask, + lit_positions, +): + group_mask = group_idx[aatype, ...] + group_mask = one_hot( + group_mask, + num_classes=default_frames.shape[-3], + ) + + t_atoms_to_global = frame[..., None, :] * group_mask + t_atoms_to_global = t_atoms_to_global.map_tensor_fn( + lambda x: torch.sum(x, dim=-1)) + + atom_mask = atom_mask[aatype, ...].unsqueeze(-1) + + lit_positions = lit_positions[aatype, ...] 
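+    # Apply each atom's rigid-group frame to its idealized (literature)
+    # coordinates, then zero out atoms that do not exist for this residue type.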
+ pred_positions = t_atoms_to_global.apply(lit_positions) + pred_positions = pred_positions * atom_mask + + return pred_positions + + +class SideChainAngleResnetIteration(nn.Module): + + def __init__(self, d_hid): + super(SideChainAngleResnetIteration, self).__init__() + + self.d_hid = d_hid + + self.linear_1 = Linear(self.d_hid, self.d_hid, init='relu') + self.act = nn.GELU() + self.linear_2 = Linear(self.d_hid, self.d_hid, init='final') + + def forward(self, s: torch.Tensor) -> torch.Tensor: + + x = self.act(s) + x = self.linear_1(x) + x = self.act(x) + x = self.linear_2(x) + + return residual(s, x, self.training) + + +class SidechainAngleResnet(nn.Module): + + def __init__(self, d_in, d_hid, num_blocks, num_angles): + super(SidechainAngleResnet, self).__init__() + + self.linear_in = Linear(d_in, d_hid) + self.act = nn.GELU() + self.linear_initial = Linear(d_in, d_hid) + + self.layers = SimpleModuleList() + for _ in range(num_blocks): + self.layers.append(SideChainAngleResnetIteration(d_hid=d_hid)) + + self.linear_out = Linear(d_hid, num_angles * 2) + + def forward(self, s: torch.Tensor, + initial_s: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + + initial_s = self.linear_initial(self.act(initial_s)) + s = self.linear_in(self.act(s)) + + s = s + initial_s + + for layer in self.layers: + s = layer(s) + + s = self.linear_out(self.act(s)) + + s = s.view(s.shape[:-1] + (-1, 2)) + + unnormalized_s = s + norm_denom = torch.sqrt( + torch.clamp( + torch.sum(s.float()**2, dim=-1, keepdim=True), + min=1e-12, + )) + s = s.float() / norm_denom + + return unnormalized_s, s.type(unnormalized_s.dtype) + + +class InvariantPointAttention(nn.Module): + + def __init__( + self, + d_single: int, + d_pair: int, + d_hid: int, + num_heads: int, + num_qk_points: int, + num_v_points: int, + separate_kv: bool = False, + bias: bool = True, + eps: float = 1e-8, + ): + super(InvariantPointAttention, self).__init__() + + self.d_hid = d_hid + self.num_heads = num_heads + self.num_qk_points = num_qk_points + self.num_v_points = num_v_points + self.eps = eps + + hc = self.d_hid * self.num_heads + self.linear_q = Linear(d_single, hc, bias=bias) + self.separate_kv = separate_kv + if self.separate_kv: + self.linear_k = Linear(d_single, hc, bias=bias) + self.linear_v = Linear(d_single, hc, bias=bias) + else: + self.linear_kv = Linear(d_single, 2 * hc, bias=bias) + + hpq = self.num_heads * self.num_qk_points * 3 + self.linear_q_points = Linear(d_single, hpq) + hpk = self.num_heads * self.num_qk_points * 3 + hpv = self.num_heads * self.num_v_points * 3 + if self.separate_kv: + self.linear_k_points = Linear(d_single, hpk) + self.linear_v_points = Linear(d_single, hpv) + else: + hpkv = hpk + hpv + self.linear_kv_points = Linear(d_single, hpkv) + + self.linear_b = Linear(d_pair, self.num_heads) + + self.head_weights = nn.Parameter(torch.zeros((num_heads))) + ipa_point_weights_init_(self.head_weights) + + concat_out_dim = self.num_heads * ( + d_pair + self.d_hid + self.num_v_points * 4) + self.linear_out = Linear(concat_out_dim, d_single, init='final') + + self.softplus = nn.Softplus() + + def forward( + self, + s: torch.Tensor, + z: torch.Tensor, + f: Frame, + square_mask: torch.Tensor, + ) -> torch.Tensor: + q = self.linear_q(s) + + q = q.view(q.shape[:-1] + (self.num_heads, -1)) + + if self.separate_kv: + k = self.linear_k(s) + v = self.linear_v(s) + k = k.view(k.shape[:-1] + (self.num_heads, -1)) + v = v.view(v.shape[:-1] + (self.num_heads, -1)) + else: + kv = self.linear_kv(s) + kv = kv.view(kv.shape[:-1] + 
(self.num_heads, -1)) + k, v = torch.split(kv, self.d_hid, dim=-1) + + q_pts = self.linear_q_points(s) + + def process_points(pts, no_points): + shape = pts.shape[:-1] + (pts.shape[-1] // 3, 3) + if self.separate_kv: + # alphafold-multimer uses different layout + pts = pts.view(pts.shape[:-1] + + (self.num_heads, no_points * 3)) + pts = torch.split(pts, pts.shape[-1] // 3, dim=-1) + pts = torch.stack(pts, dim=-1).view(*shape) + pts = f[..., None].apply(pts) + + pts = pts.view(pts.shape[:-2] + (self.num_heads, no_points, 3)) + return pts + + q_pts = process_points(q_pts, self.num_qk_points) + + if self.separate_kv: + k_pts = self.linear_k_points(s) + v_pts = self.linear_v_points(s) + k_pts = process_points(k_pts, self.num_qk_points) + v_pts = process_points(v_pts, self.num_v_points) + else: + kv_pts = self.linear_kv_points(s) + + kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1) + kv_pts = torch.stack(kv_pts, dim=-1) + kv_pts = f[..., None].apply(kv_pts) + + kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.num_heads, -1, 3)) + + k_pts, v_pts = torch.split( + kv_pts, [self.num_qk_points, self.num_v_points], dim=-2) + + bias = self.linear_b(z) + + attn = torch.matmul( + permute_final_dims(q, (1, 0, 2)), + permute_final_dims(k, (1, 2, 0)), + ) + + if self.training: + attn = attn * math.sqrt(1.0 / (3 * self.d_hid)) + attn = attn + ( + math.sqrt(1.0 / 3) * permute_final_dims(bias, (2, 0, 1))) + pt_att = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5) + pt_att = pt_att.float()**2 + else: + attn *= math.sqrt(1.0 / (3 * self.d_hid)) + attn += (math.sqrt(1.0 / 3) * permute_final_dims(bias, (2, 0, 1))) + pt_att = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5) + pt_att *= pt_att + + pt_att = pt_att.sum(dim=-1) + head_weights = self.softplus(self.head_weights).view( + *((1, ) * len(pt_att.shape[:-2]) + (-1, 1))) + head_weights = head_weights * math.sqrt( + 1.0 / (3 * (self.num_qk_points * 9.0 / 2))) + pt_att *= head_weights * (-0.5) + + pt_att = torch.sum(pt_att, dim=-1) + + pt_att = permute_final_dims(pt_att, (2, 0, 1)) + attn += square_mask + attn = softmax_dropout( + attn, 0, self.training, bias=pt_att.type(attn.dtype)) + del pt_att, q_pts, k_pts, bias + o = torch.matmul(attn, v.transpose(-2, -3)).transpose(-2, -3) + o = o.contiguous().view(*o.shape[:-2], -1) + del q, k, v + o_pts = torch.sum( + (attn[..., None, :, :, None] + * permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :]), + dim=-2, + ) + + o_pts = permute_final_dims(o_pts, (2, 0, 3, 1)) + o_pts = f[..., None, None].invert_apply(o_pts) + if self.training: + o_pts_norm = torch.sqrt( + torch.sum(o_pts.float()**2, dim=-1) + self.eps).type( + o_pts.dtype) + else: + o_pts_norm = torch.sqrt(torch.sum(o_pts**2, dim=-1) + + self.eps).type(o_pts.dtype) + + o_pts_norm = o_pts_norm.view(*o_pts_norm.shape[:-2], -1) + + o_pts = o_pts.view(*o_pts.shape[:-3], -1, 3) + + o_pair = torch.matmul(attn.transpose(-2, -3), z) + + o_pair = o_pair.view(*o_pair.shape[:-2], -1) + + s = self.linear_out( + torch.cat((o, *torch.unbind(o_pts, dim=-1), o_pts_norm, o_pair), + dim=-1)) + + return s + + +class BackboneUpdate(nn.Module): + + def __init__(self, d_single): + super(BackboneUpdate, self).__init__() + self.linear = Linear(d_single, 6, init='final') + + def forward(self, s: torch.Tensor): + return self.linear(s) + + +class StructureModuleTransitionLayer(nn.Module): + + def __init__(self, c): + super(StructureModuleTransitionLayer, self).__init__() + + self.linear_1 = Linear(c, c, init='relu') + self.linear_2 = Linear(c, c, init='relu') + self.act = nn.GELU() + 
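+        # 'final' init is presumably zero-initialized (as in AlphaFold-style
+        # Linear helpers), so the residual update starts close to identity.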
self.linear_3 = Linear(c, c, init='final') + + def forward(self, s): + s_old = s + s = self.linear_1(s) + s = self.act(s) + s = self.linear_2(s) + s = self.act(s) + s = self.linear_3(s) + + s = residual(s_old, s, self.training) + + return s + + +class StructureModuleTransition(nn.Module): + + def __init__(self, c, num_layers, dropout_rate): + super(StructureModuleTransition, self).__init__() + + self.num_layers = num_layers + self.dropout_rate = dropout_rate + + self.layers = SimpleModuleList() + for _ in range(self.num_layers): + self.layers.append(StructureModuleTransitionLayer(c)) + + self.dropout = nn.Dropout(self.dropout_rate) + self.layer_norm = LayerNorm(c) + + def forward(self, s): + for layer in self.layers: + s = layer(s) + + s = self.dropout(s) + s = self.layer_norm(s) + + return s + + +class StructureModule(nn.Module): + + def __init__( + self, + d_single, + d_pair, + d_ipa, + d_angle, + num_heads_ipa, + num_qk_points, + num_v_points, + dropout_rate, + num_blocks, + no_transition_layers, + num_resnet_blocks, + num_angles, + trans_scale_factor, + separate_kv, + ipa_bias, + epsilon, + inf, + **kwargs, + ): + super(StructureModule, self).__init__() + + self.num_blocks = num_blocks + self.trans_scale_factor = trans_scale_factor + self.default_frames = None + self.group_idx = None + self.atom_mask = None + self.lit_positions = None + self.inf = inf + + self.layer_norm_s = LayerNorm(d_single) + self.layer_norm_z = LayerNorm(d_pair) + + self.linear_in = Linear(d_single, d_single) + + self.ipa = InvariantPointAttention( + d_single, + d_pair, + d_ipa, + num_heads_ipa, + num_qk_points, + num_v_points, + separate_kv=separate_kv, + bias=ipa_bias, + eps=epsilon, + ) + + self.ipa_dropout = nn.Dropout(dropout_rate) + self.layer_norm_ipa = LayerNorm(d_single) + + self.transition = StructureModuleTransition( + d_single, + no_transition_layers, + dropout_rate, + ) + + self.bb_update = BackboneUpdate(d_single) + + self.angle_resnet = SidechainAngleResnet( + d_single, + d_angle, + num_resnet_blocks, + num_angles, + ) + + def forward( + self, + s, + z, + aatype, + mask=None, + ): + if mask is None: + mask = s.new_ones(s.shape[:-1]) + + # generate square mask + square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2) + square_mask = gen_attn_mask(square_mask, -self.inf).unsqueeze(-3) + s = self.layer_norm_s(s) + z = self.layer_norm_z(z) + initial_s = s + s = self.linear_in(s) + + quat_encoder = Quaternion.identity( + s.shape[:-1], + s.dtype, + s.device, + requires_grad=False, + ) + backb_to_global = Frame( + Rotation(mat=quat_encoder.get_rot_mats(), ), + quat_encoder.get_trans(), + ) + outputs = [] + for i in range(self.num_blocks): + s = residual(s, self.ipa(s, z, backb_to_global, square_mask), + self.training) + s = self.ipa_dropout(s) + s = self.layer_norm_ipa(s) + s = self.transition(s) + + # update quaternion encoder + # use backb_to_global to avoid quat-to-rot conversion + quat_encoder = quat_encoder.compose_update_vec( + self.bb_update(s), pre_rot_mat=backb_to_global.get_rots()) + + # initial_s is always used to update the backbone + unnormalized_angles, angles = self.angle_resnet(s, initial_s) + + # convert quaternion to rotation matrix + backb_to_global = Frame( + Rotation(mat=quat_encoder.get_rot_mats(), ), + quat_encoder.get_trans(), + ) + if i == self.num_blocks - 1: + all_frames_to_global = self.torsion_angles_to_frames( + backb_to_global.scale_translation(self.trans_scale_factor), + angles, + aatype, + ) + + pred_positions = self.frames_and_literature_positions_to_atom14_pos( + 
all_frames_to_global, + aatype, + ) + + preds = { + 'frames': + backb_to_global.scale_translation( + self.trans_scale_factor).to_tensor_4x4(), + 'unnormalized_angles': + unnormalized_angles, + 'angles': + angles, + } + + outputs.append(preds) + if i < (self.num_blocks - 1): + # stop gradient in iteration + quat_encoder = quat_encoder.stop_rot_gradient() + backb_to_global = backb_to_global.stop_rot_gradient() + + outputs = dict_multimap(torch.stack, outputs) + outputs['sidechain_frames'] = all_frames_to_global.to_tensor_4x4() + outputs['positions'] = pred_positions + outputs['single'] = s + + return outputs + + def _init_residue_constants(self, float_dtype, device): + if self.default_frames is None: + self.default_frames = torch.tensor( + restype_rigid_group_default_frame, + dtype=float_dtype, + device=device, + requires_grad=False, + ) + if self.group_idx is None: + self.group_idx = torch.tensor( + restype_atom14_to_rigid_group, + device=device, + requires_grad=False, + ) + if self.atom_mask is None: + self.atom_mask = torch.tensor( + restype_atom14_mask, + dtype=float_dtype, + device=device, + requires_grad=False, + ) + if self.lit_positions is None: + self.lit_positions = torch.tensor( + restype_atom14_rigid_group_positions, + dtype=float_dtype, + device=device, + requires_grad=False, + ) + + def torsion_angles_to_frames(self, frame, alpha, aatype): + self._init_residue_constants(alpha.dtype, alpha.device) + return torsion_angles_to_frames(frame, alpha, aatype, + self.default_frames) + + def frames_and_literature_positions_to_atom14_pos(self, frame, aatype): + self._init_residue_constants(frame.get_rots().dtype, + frame.get_rots().device) + return frames_and_literature_positions_to_atom14_pos( + frame, + aatype, + self.default_frames, + self.group_idx, + self.atom_mask, + self.lit_positions, + ) diff --git a/modelscope/models/science/unifold/modules/template.py b/modelscope/models/science/unifold/modules/template.py new file mode 100644 index 00000000..49e5bec0 --- /dev/null +++ b/modelscope/models/science/unifold/modules/template.py @@ -0,0 +1,330 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. 
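+#
+# Overview: each TemplatePairStackBlock mixes triangle attention (around the
+# starting and ending node) with triangle multiplicative updates, in an order
+# controlled by tri_attn_first. TemplatePairStack runs the blocks over every
+# template with sequential checkpointing and either averages the per-template
+# results (return_mean) or stacks them, while TemplatePointwiseAttention lets
+# the pair representation attend over the per-template embeddings.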
+ +import math +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from unicore.modules import LayerNorm +from unicore.utils import (checkpoint_sequential, permute_final_dims, + tensor_tree_map) + +from .attentions import (Attention, TriangleAttentionEnding, + TriangleAttentionStarting, gen_attn_mask) +from .common import (Linear, SimpleModuleList, Transition, + bias_dropout_residual, chunk_layer, residual, + tri_mul_residual) +from .featurization import build_template_pair_feat_v2 +from .triangle_multiplication import (TriangleMultiplicationIncoming, + TriangleMultiplicationOutgoing) + + +class TemplatePointwiseAttention(nn.Module): + + def __init__(self, d_template, d_pair, d_hid, num_heads, inf, **kwargs): + super(TemplatePointwiseAttention, self).__init__() + + self.inf = inf + + self.mha = Attention( + d_pair, + d_template, + d_template, + d_hid, + num_heads, + gating=False, + ) + + def _chunk( + self, + z: torch.Tensor, + t: torch.Tensor, + mask: torch.Tensor, + chunk_size: int, + ) -> torch.Tensor: + mha_inputs = { + 'q': z, + 'k': t, + 'v': t, + 'mask': mask, + } + return chunk_layer( + self.mha, + mha_inputs, + chunk_size=chunk_size, + num_batch_dims=len(z.shape[:-2]), + ) + + def forward( + self, + t: torch.Tensor, + z: torch.Tensor, + template_mask: Optional[torch.Tensor] = None, + chunk_size: Optional[int] = None, + ) -> torch.Tensor: + if template_mask is None: + template_mask = t.new_ones(t.shape[:-3]) + + mask = gen_attn_mask(template_mask, -self.inf)[..., None, None, None, + None, :] + z = z.unsqueeze(-2) + + t = permute_final_dims(t, (1, 2, 0, 3)) + + if chunk_size is not None: + z = self._chunk(z, t, mask, chunk_size) + else: + z = self.mha(z, t, t, mask=mask) + + z = z.squeeze(-2) + + return z + + +class TemplateProjection(nn.Module): + + def __init__(self, d_template, d_pair, **kwargs): + super(TemplateProjection, self).__init__() + + self.d_pair = d_pair + self.act = nn.ReLU() + self.output_linear = Linear(d_template, d_pair, init='relu') + + def forward(self, t, z) -> torch.Tensor: + if t is None: + # handle for non-template case + shape = z.shape + shape[-1] = self.d_pair + t = torch.zeros(shape, dtype=z.dtype, device=z.device) + t = self.act(t) + z_t = self.output_linear(t) + return z_t + + +class TemplatePairStackBlock(nn.Module): + + def __init__( + self, + d_template: int, + d_hid_tri_att: int, + d_hid_tri_mul: int, + num_heads: int, + pair_transition_n: int, + dropout_rate: float, + tri_attn_first: bool, + inf: float, + **kwargs, + ): + super(TemplatePairStackBlock, self).__init__() + + self.tri_att_start = TriangleAttentionStarting( + d_template, + d_hid_tri_att, + num_heads, + ) + self.tri_att_end = TriangleAttentionEnding( + d_template, + d_hid_tri_att, + num_heads, + ) + + self.tri_mul_out = TriangleMultiplicationOutgoing( + d_template, + d_hid_tri_mul, + ) + self.tri_mul_in = TriangleMultiplicationIncoming( + d_template, + d_hid_tri_mul, + ) + + self.pair_transition = Transition( + d_template, + pair_transition_n, + ) + self.tri_attn_first = tri_attn_first + self.dropout = dropout_rate + self.row_dropout_share_dim = -3 + self.col_dropout_share_dim = -2 + + def forward( + self, + s: torch.Tensor, + mask: torch.Tensor, + tri_start_attn_mask: torch.Tensor, + tri_end_attn_mask: torch.Tensor, + chunk_size: Optional[int] = None, + block_size: Optional[int] = None, + ): + if self.tri_attn_first: + s = bias_dropout_residual( + self.tri_att_start, + s, + self.tri_att_start( + s, 
attn_mask=tri_start_attn_mask, chunk_size=chunk_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + ) + + s = bias_dropout_residual( + self.tri_att_end, + s, + self.tri_att_end( + s, attn_mask=tri_end_attn_mask, chunk_size=chunk_size), + self.col_dropout_share_dim, + self.dropout, + self.training, + ) + s = tri_mul_residual( + self.tri_mul_out, + s, + self.tri_mul_out(s, mask=mask, block_size=block_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + block_size=block_size, + ) + + s = tri_mul_residual( + self.tri_mul_in, + s, + self.tri_mul_in(s, mask=mask, block_size=block_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + block_size=block_size, + ) + else: + s = tri_mul_residual( + self.tri_mul_out, + s, + self.tri_mul_out(s, mask=mask, block_size=block_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + block_size=block_size, + ) + + s = tri_mul_residual( + self.tri_mul_in, + s, + self.tri_mul_in(s, mask=mask, block_size=block_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + block_size=block_size, + ) + + s = bias_dropout_residual( + self.tri_att_start, + s, + self.tri_att_start( + s, attn_mask=tri_start_attn_mask, chunk_size=chunk_size), + self.row_dropout_share_dim, + self.dropout, + self.training, + ) + + s = bias_dropout_residual( + self.tri_att_end, + s, + self.tri_att_end( + s, attn_mask=tri_end_attn_mask, chunk_size=chunk_size), + self.col_dropout_share_dim, + self.dropout, + self.training, + ) + s = residual(s, self.pair_transition( + s, + chunk_size=chunk_size, + ), self.training) + return s + + +class TemplatePairStack(nn.Module): + + def __init__( + self, + d_template, + d_hid_tri_att, + d_hid_tri_mul, + num_blocks, + num_heads, + pair_transition_n, + dropout_rate, + tri_attn_first, + inf=1e9, + **kwargs, + ): + super(TemplatePairStack, self).__init__() + + self.blocks = SimpleModuleList() + for _ in range(num_blocks): + self.blocks.append( + TemplatePairStackBlock( + d_template=d_template, + d_hid_tri_att=d_hid_tri_att, + d_hid_tri_mul=d_hid_tri_mul, + num_heads=num_heads, + pair_transition_n=pair_transition_n, + dropout_rate=dropout_rate, + inf=inf, + tri_attn_first=tri_attn_first, + )) + + self.layer_norm = LayerNorm(d_template) + + def forward( + self, + single_templates: Tuple[torch.Tensor], + mask: torch.tensor, + tri_start_attn_mask: torch.Tensor, + tri_end_attn_mask: torch.Tensor, + templ_dim: int, + chunk_size: int, + block_size: int, + return_mean: bool, + ): + + def one_template(i): + (s, ) = checkpoint_sequential( + functions=[ + partial( + b, + mask=mask, + tri_start_attn_mask=tri_start_attn_mask, + tri_end_attn_mask=tri_end_attn_mask, + chunk_size=chunk_size, + block_size=block_size, + ) for b in self.blocks + ], + input=(single_templates[i], ), + ) + return s + + n_templ = len(single_templates) + if n_templ > 0: + new_single_templates = [one_template(0)] + if return_mean: + t = self.layer_norm(new_single_templates[0]) + for i in range(1, n_templ): + s = one_template(i) + if return_mean: + t = residual(t, self.layer_norm(s), self.training) + else: + new_single_templates.append(s) + + if return_mean: + if n_templ > 0: + t /= n_templ + else: + t = None + else: + t = torch.cat( + [s.unsqueeze(templ_dim) for s in new_single_templates], + dim=templ_dim) + t = self.layer_norm(t) + + return t diff --git a/modelscope/models/science/unifold/modules/triangle_multiplication.py b/modelscope/models/science/unifold/modules/triangle_multiplication.py new file mode 100644 
index 00000000..c4094cd2 --- /dev/null +++ b/modelscope/models/science/unifold/modules/triangle_multiplication.py @@ -0,0 +1,158 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +from functools import partialmethod +from typing import List, Optional + +import torch +import torch.nn as nn +from unicore.modules import LayerNorm +from unicore.utils import permute_final_dims + +from .common import Linear + + +class TriangleMultiplication(nn.Module): + + def __init__(self, d_pair, d_hid, outgoing=True): + super(TriangleMultiplication, self).__init__() + self.outgoing = outgoing + + self.linear_ab_p = Linear(d_pair, d_hid * 2) + self.linear_ab_g = Linear(d_pair, d_hid * 2, init='gating') + + self.linear_g = Linear(d_pair, d_pair, init='gating') + self.linear_z = Linear(d_hid, d_pair, init='final') + + self.layer_norm_in = LayerNorm(d_pair) + self.layer_norm_out = LayerNorm(d_hid) + + self._alphafold_original_mode = False + + def _chunk_2d( + self, + z: torch.Tensor, + mask: Optional[torch.Tensor] = None, + block_size: int = None, + ) -> torch.Tensor: + + # avoid too small chunk size + # block_size = max(block_size, 256) + new_z = z.new_zeros(z.shape) + dim1 = z.shape[-3] + + def _slice_linear(z, linear: Linear, a=True): + d_hid = linear.bias.shape[0] // 2 + index = 0 if a else d_hid + p = ( + nn.functional.linear(z, linear.weight[index:index + d_hid]) + + linear.bias[index:index + d_hid]) + return p + + def _chunk_projection(z, mask, a=True): + p = _slice_linear(z, self.linear_ab_p, a) * mask + p *= torch.sigmoid(_slice_linear(z, self.linear_ab_g, a)) + return p + + num_chunk = (dim1 + block_size - 1) // block_size + for i in range(num_chunk): + chunk_start = i * block_size + chunk_end = min(chunk_start + block_size, dim1) + if self.outgoing: + a_chunk = _chunk_projection( + z[..., chunk_start:chunk_end, :, :], + mask[..., chunk_start:chunk_end, :, :], + a=True, + ) + a_chunk = permute_final_dims(a_chunk, (2, 0, 1)) + else: + a_chunk = _chunk_projection( + z[..., :, chunk_start:chunk_end, :], + mask[..., :, chunk_start:chunk_end, :], + a=True, + ) + a_chunk = a_chunk.transpose(-1, -3) + + for j in range(num_chunk): + j_chunk_start = j * block_size + j_chunk_end = min(j_chunk_start + block_size, dim1) + if self.outgoing: + b_chunk = _chunk_projection( + z[..., j_chunk_start:j_chunk_end, :, :], + mask[..., j_chunk_start:j_chunk_end, :, :], + a=False, + ) + b_chunk = b_chunk.transpose(-1, -3) + else: + b_chunk = _chunk_projection( + z[..., :, j_chunk_start:j_chunk_end, :], + mask[..., :, j_chunk_start:j_chunk_end, :], + a=False, + ) + b_chunk = permute_final_dims(b_chunk, (2, 0, 1)) + x_chunk = torch.matmul(a_chunk, b_chunk) + del b_chunk + x_chunk = permute_final_dims(x_chunk, (1, 2, 0)) + x_chunk = self.layer_norm_out(x_chunk) + x_chunk = self.linear_z(x_chunk) + x_chunk *= torch.sigmoid( + self.linear_g(z[..., chunk_start:chunk_end, + j_chunk_start:j_chunk_end, :])) + new_z[..., chunk_start:chunk_end, + j_chunk_start:j_chunk_end, :] = x_chunk + del x_chunk + del a_chunk + return new_z + + def forward( + self, + z: torch.Tensor, + mask: Optional[torch.Tensor] = None, + block_size=None, + ) -> torch.Tensor: + + mask = mask.unsqueeze(-1) + if not self._alphafold_original_mode: + # divided by 1/sqrt(dim) for numerical stability + mask = mask * (mask.shape[-2]**-0.5) + + z = self.layer_norm_in(z) + if not self.training and block_size is not None: + return self._chunk_2d(z, mask, 
block_size=block_size) + + g = nn.functional.linear(z, self.linear_g.weight) + if self.training: + ab = self.linear_ab_p(z) * mask * torch.sigmoid( + self.linear_ab_g(z)) + else: + ab = self.linear_ab_p(z) + ab *= mask + ab *= torch.sigmoid(self.linear_ab_g(z)) + a, b = torch.chunk(ab, 2, dim=-1) + del z, ab + + if self.outgoing: + a = permute_final_dims(a, (2, 0, 1)) + b = b.transpose(-1, -3) + else: + b = permute_final_dims(b, (2, 0, 1)) + a = a.transpose(-1, -3) + x = torch.matmul(a, b) + del a, b + + x = permute_final_dims(x, (1, 2, 0)) + + x = self.layer_norm_out(x) + x = nn.functional.linear(x, self.linear_z.weight) + return x, g + + def get_output_bias(self): + return self.linear_z.bias, self.linear_g.bias + + +class TriangleMultiplicationOutgoing(TriangleMultiplication): + __init__ = partialmethod(TriangleMultiplication.__init__, outgoing=True) + + +class TriangleMultiplicationIncoming(TriangleMultiplication): + __init__ = partialmethod(TriangleMultiplication.__init__, outgoing=False) diff --git a/modelscope/models/science/unifold/msa/__init__.py b/modelscope/models/science/unifold/msa/__init__.py new file mode 100644 index 00000000..2121062c --- /dev/null +++ b/modelscope/models/science/unifold/msa/__init__.py @@ -0,0 +1 @@ +""" Scripts for MSA & template searching. """ diff --git a/modelscope/models/science/unifold/msa/mmcif.py b/modelscope/models/science/unifold/msa/mmcif.py new file mode 100644 index 00000000..cf67239f --- /dev/null +++ b/modelscope/models/science/unifold/msa/mmcif.py @@ -0,0 +1,483 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Parses the mmCIF file format.""" +import collections +import dataclasses +import functools +import io +from typing import Any, Mapping, Optional, Sequence, Tuple + +from absl import logging +from Bio import PDB +from Bio.Data import SCOPData +from Bio.PDB.MMCIFParser import MMCIFParser + +# Type aliases: +ChainId = str +PdbHeader = Mapping[str, Any] +PdbStructure = PDB.Structure.Structure +SeqRes = str +MmCIFDict = Mapping[str, Sequence[str]] + + +@dataclasses.dataclass(frozen=True) +class Monomer: + id: str + num: int + + +# Note - mmCIF format provides no guarantees on the type of author-assigned +# sequence numbers. They need not be integers. +@dataclasses.dataclass(frozen=True) +class AtomSite: + residue_name: str + author_chain_id: str + mmcif_chain_id: str + author_seq_num: str + mmcif_seq_num: int + insertion_code: str + hetatm_atom: str + model_num: int + + +# Used to map SEQRES index to a residue in the structure. +@dataclasses.dataclass(frozen=True) +class ResiduePosition: + chain_id: str + residue_number: int + insertion_code: str + + +@dataclasses.dataclass(frozen=True) +class ResidueAtPosition: + position: Optional[ResiduePosition] + name: str + is_missing: bool + hetflag: str + + +@dataclasses.dataclass(frozen=True) +class MmcifObject: + """Representation of a parsed mmCIF file. + + Contains: + file_id: A meaningful name, e.g. a pdb_id. 
Should be unique amongst all + files being processed. + header: Biopython header. + structure: Biopython structure. + chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g. + {'A': 'ABCDEFG'} + seqres_to_structure: Dict; for each chain_id contains a mapping between + SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition, 1: ResidueAtPosition, ...}} + raw_string: The raw string used to construct the MmcifObject. + """ + + file_id: str + header: PdbHeader + structure: PdbStructure + chain_to_seqres: Mapping[ChainId, SeqRes] + seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]] + raw_string: Any + mmcif_to_author_chain_id: Mapping[ChainId, ChainId] + valid_chains: Mapping[ChainId, str] + + +@dataclasses.dataclass(frozen=True) +class ParsingResult: + """Returned by the parse function. + + Contains: + mmcif_object: A MmcifObject, may be None if no chain could be successfully + parsed. + errors: A dict mapping (file_id, chain_id) to any exception generated. + """ + + mmcif_object: Optional[MmcifObject] + errors: Mapping[Tuple[str, str], Any] + + +class ParseError(Exception): + """An error indicating that an mmCIF file could not be parsed.""" + + +def mmcif_loop_to_list(prefix: str, + parsed_info: MmCIFDict) -> Sequence[Mapping[str, str]]: + """Extracts loop associated with a prefix from mmCIF data as a list. + + Reference for loop_ in mmCIF: + http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html + + Args: + prefix: Prefix shared by each of the data items in the loop. + e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, + _entity_poly_seq.mon_id. Should include the trailing period. + parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython + parser. + + Returns: + Returns a list of dicts; each dict represents 1 entry from an mmCIF loop. + """ + cols = [] + data = [] + for key, value in parsed_info.items(): + if key.startswith(prefix): + cols.append(key) + data.append(value) + + assert all([ + len(xs) == len(data[0]) for xs in data + ]), ('mmCIF error: Not all loops are the same length: %s' % cols) + + return [dict(zip(cols, xs)) for xs in zip(*data)] + + +def mmcif_loop_to_dict( + prefix: str, + index: str, + parsed_info: MmCIFDict, +) -> Mapping[str, Mapping[str, str]]: + """Extracts loop associated with a prefix from mmCIF data as a dictionary. + + Args: + prefix: Prefix shared by each of the data items in the loop. + e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num, + _entity_poly_seq.mon_id. Should include the trailing period. + index: Which item of loop data should serve as the key. + parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython + parser. + + Returns: + Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop, + indexed by the index column. + """ + entries = mmcif_loop_to_list(prefix, parsed_info) + return {entry[index]: entry for entry in entries} + + +@functools.lru_cache(16, typed=False) +def fast_parse(*, + file_id: str, + mmcif_string: str, + catch_all_errors: bool = True) -> ParsingResult: + """Entry point, parses an mmcif_string. + + Args: + file_id: A string identifier for this file. Should be unique within the + collection of files being processed. + mmcif_string: Contents of an mmCIF file. + catch_all_errors: If True, all exceptions are caught and error messages are + returned as part of the ParsingResult. If False exceptions will be allowed + to propagate. + + Returns: + A ParsingResult. 
+ """ + errors = {} + try: + parser = MMCIFParser(QUIET=True) + # handle = io.StringIO(mmcif_string) + # full_structure = parser.get_structure('', handle) + parsed_info = parser._mmcif_dict # pylint:disable=protected-access + + # Ensure all values are lists, even if singletons. + for key, value in parsed_info.items(): + if not isinstance(value, list): + parsed_info[key] = [value] + + header = _get_header(parsed_info) + + # Determine the protein chains, and their start numbers according to the + # internal mmCIF numbering scheme (likely but not guaranteed to be 1). + valid_chains = _get_protein_chains(parsed_info=parsed_info) + if not valid_chains: + return ParsingResult( + None, {(file_id, ''): 'No protein chains found in this file.'}) + + mmcif_to_author_chain_id = {} + # seq_to_structure_mappings = {} + for atom in _get_atom_site_list(parsed_info): + if atom.model_num != '1': + # We only process the first model at the moment. + continue + mmcif_to_author_chain_id[ + atom.mmcif_chain_id] = atom.author_chain_id + + mmcif_object = MmcifObject( + file_id=file_id, + header=header, + structure=None, + chain_to_seqres=None, + seqres_to_structure=None, + raw_string=parsed_info, + mmcif_to_author_chain_id=mmcif_to_author_chain_id, + valid_chains=valid_chains, + ) + + return ParsingResult(mmcif_object=mmcif_object, errors=errors) + except Exception as e: # pylint:disable=broad-except + errors[(file_id, '')] = e + if not catch_all_errors: + raise + return ParsingResult(mmcif_object=None, errors=errors) + + +@functools.lru_cache(16, typed=False) +def parse(*, + file_id: str, + mmcif_string: str, + catch_all_errors: bool = True) -> ParsingResult: + """Entry point, parses an mmcif_string. + + Args: + file_id: A string identifier for this file. Should be unique within the + collection of files being processed. + mmcif_string: Contents of an mmCIF file. + catch_all_errors: If True, all exceptions are caught and error messages are + returned as part of the ParsingResult. If False exceptions will be allowed + to propagate. + + Returns: + A ParsingResult. + """ + errors = {} + try: + parser = PDB.MMCIFParser(QUIET=True) + handle = io.StringIO(mmcif_string) + full_structure = parser.get_structure('', handle) + first_model_structure = _get_first_model(full_structure) + # Extract the _mmcif_dict from the parser, which contains useful fields not + # reflected in the Biopython structure. + parsed_info = parser._mmcif_dict # pylint:disable=protected-access + + # Ensure all values are lists, even if singletons. + for key, value in parsed_info.items(): + if not isinstance(value, list): + parsed_info[key] = [value] + + header = _get_header(parsed_info) + + # Determine the protein chains, and their start numbers according to the + # internal mmCIF numbering scheme (likely but not guaranteed to be 1). + valid_chains = _get_protein_chains(parsed_info=parsed_info) + if not valid_chains: + return ParsingResult( + None, {(file_id, ''): 'No protein chains found in this file.'}) + seq_start_num = { + chain_id: min([monomer.num for monomer in seq]) + for chain_id, seq in valid_chains.items() + } + + # Loop over the atoms for which we have coordinates. Populate two mappings: + # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used + # the authors / Biopython). + # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition). 
+ mmcif_to_author_chain_id = {} + seq_to_structure_mappings = {} + for atom in _get_atom_site_list(parsed_info): + if atom.model_num != '1': + # We only process the first model at the moment. + continue + + mmcif_to_author_chain_id[ + atom.mmcif_chain_id] = atom.author_chain_id + + if atom.mmcif_chain_id in valid_chains: + hetflag = ' ' + if atom.hetatm_atom == 'HETATM': + # Water atoms are assigned a special hetflag of W in Biopython. We + # need to do the same, so that this hetflag can be used to fetch + # a residue from the Biopython structure by id. + if atom.residue_name in ('HOH', 'WAT'): + hetflag = 'W' + else: + hetflag = 'H_' + atom.residue_name + insertion_code = atom.insertion_code + if not _is_set(atom.insertion_code): + insertion_code = ' ' + position = ResiduePosition( + chain_id=atom.author_chain_id, + residue_number=int(atom.author_seq_num), + insertion_code=insertion_code, + ) + seq_idx = int( + atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id] + current = seq_to_structure_mappings.get( + atom.author_chain_id, {}) + current[seq_idx] = ResidueAtPosition( + position=position, + name=atom.residue_name, + is_missing=False, + hetflag=hetflag, + ) + seq_to_structure_mappings[atom.author_chain_id] = current + + # Add missing residue information to seq_to_structure_mappings. + for chain_id, seq_info in valid_chains.items(): + author_chain = mmcif_to_author_chain_id[chain_id] + current_mapping = seq_to_structure_mappings[author_chain] + for idx, monomer in enumerate(seq_info): + if idx not in current_mapping: + current_mapping[idx] = ResidueAtPosition( + position=None, + name=monomer.id, + is_missing=True, + hetflag=' ') + + author_chain_to_sequence = {} + for chain_id, seq_info in valid_chains.items(): + author_chain = mmcif_to_author_chain_id[chain_id] + seq = [] + for monomer in seq_info: + code = SCOPData.protein_letters_3to1.get(monomer.id, 'X') + seq.append(code if len(code) == 1 else 'X') + seq = ''.join(seq) + author_chain_to_sequence[author_chain] = seq + + mmcif_object = MmcifObject( + file_id=file_id, + header=header, + structure=first_model_structure, + chain_to_seqres=author_chain_to_sequence, + seqres_to_structure=seq_to_structure_mappings, + raw_string=parsed_info, + mmcif_to_author_chain_id=mmcif_to_author_chain_id, + valid_chains=valid_chains, + ) + + return ParsingResult(mmcif_object=mmcif_object, errors=errors) + except Exception as e: # pylint:disable=broad-except + errors[(file_id, '')] = e + if not catch_all_errors: + raise + return ParsingResult(mmcif_object=None, errors=errors) + + +def _get_first_model(structure: PdbStructure) -> PdbStructure: + """Returns the first model in a Biopython structure.""" + return next(structure.get_models()) + + +_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21 + + +def get_release_date(parsed_info: MmCIFDict) -> str: + """Returns the oldest revision date.""" + revision_dates = parsed_info['_pdbx_audit_revision_history.revision_date'] + return min(revision_dates) + + +def _get_header(parsed_info: MmCIFDict) -> PdbHeader: + """Returns a basic header containing method, release date and resolution.""" + header = {} + + experiments = mmcif_loop_to_list('_exptl.', parsed_info) + header['structure_method'] = ','.join( + [experiment['_exptl.method'].lower() for experiment in experiments]) + + # Note: The release_date here corresponds to the oldest revision. We prefer to + # use this for dataset filtering over the deposition_date. 
+ if '_pdbx_audit_revision_history.revision_date' in parsed_info: + header['release_date'] = get_release_date(parsed_info) + else: + logging.warning('Could not determine release_date: %s', + parsed_info['_entry.id']) + + header['resolution'] = 0.00 + for res_key in ( + '_refine.ls_d_res_high', + '_em_3d_reconstruction.resolution', + '_reflns.d_resolution_high', + ): + if res_key in parsed_info: + try: + raw_resolution = parsed_info[res_key][0] + header['resolution'] = float(raw_resolution) + except ValueError: + logging.debug('Invalid resolution format: %s', + parsed_info[res_key]) + + return header + + +def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]: + """Returns list of atom sites; contains data not present in the structure.""" + return [ + AtomSite(*site) for site in zip( # pylint:disable=g-complex-comprehension + parsed_info['_atom_site.label_comp_id'], + parsed_info['_atom_site.auth_asym_id'], + parsed_info['_atom_site.label_asym_id'], + parsed_info['_atom_site.auth_seq_id'], + parsed_info['_atom_site.label_seq_id'], + parsed_info['_atom_site.pdbx_PDB_ins_code'], + parsed_info['_atom_site.group_PDB'], + parsed_info['_atom_site.pdbx_PDB_model_num'], + ) + ] + + +def _get_protein_chains( + *, parsed_info: Mapping[str, + Any]) -> Mapping[ChainId, Sequence[Monomer]]: + """Extracts polymer information for protein chains only. + + Args: + parsed_info: _mmcif_dict produced by the Biopython parser. + + Returns: + A dict mapping mmcif chain id to a list of Monomers. + """ + # Get polymer information for each entity in the structure. + entity_poly_seqs = mmcif_loop_to_list('_entity_poly_seq.', parsed_info) + + polymers = collections.defaultdict(list) + for entity_poly_seq in entity_poly_seqs: + polymers[entity_poly_seq['_entity_poly_seq.entity_id']].append( + Monomer( + id=entity_poly_seq['_entity_poly_seq.mon_id'], + num=int(entity_poly_seq['_entity_poly_seq.num']), + )) + + # Get chemical compositions. Will allow us to identify which of these polymers + # are proteins. + chem_comps = mmcif_loop_to_dict('_chem_comp.', '_chem_comp.id', + parsed_info) + + # Get chains information for each entity. Necessary so that we can return a + # dict keyed on chain id rather than entity. + struct_asyms = mmcif_loop_to_list('_struct_asym.', parsed_info) + + entity_to_mmcif_chains = collections.defaultdict(list) + for struct_asym in struct_asyms: + chain_id = struct_asym['_struct_asym.id'] + entity_id = struct_asym['_struct_asym.entity_id'] + entity_to_mmcif_chains[entity_id].append(chain_id) + + # Identify and return the valid protein chains. + valid_chains = {} + for entity_id, seq_info in polymers.items(): + chain_ids = entity_to_mmcif_chains[entity_id] + + # Reject polymers without any peptide-like components, such as DNA/RNA. 
+ if any([ + 'peptide' in chem_comps[monomer.id]['_chem_comp.type'] + for monomer in seq_info + ]): + for chain_id in chain_ids: + valid_chains[chain_id] = seq_info + return valid_chains + + +def _is_set(data: str) -> bool: + """Returns False if data is a special mmCIF character indicating 'unset'.""" + return data not in ('.', '?') diff --git a/modelscope/models/science/unifold/msa/msa_identifiers.py b/modelscope/models/science/unifold/msa/msa_identifiers.py new file mode 100644 index 00000000..366239db --- /dev/null +++ b/modelscope/models/science/unifold/msa/msa_identifiers.py @@ -0,0 +1,88 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for extracting identifiers from MSA sequence descriptions.""" + +import dataclasses +import re +from typing import Optional + +# Sequences coming from UniProtKB database come in the +# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE` +# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively). +_UNIPROT_PATTERN = re.compile( + r""" + ^ + # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot + (?:tr|sp) + \| + # A primary accession number of the UniProtKB entry. + (?P[A-Za-z0-9]{6,10}) + # Occasionally there is a _0 or _1 isoform suffix, which we ignore. + (?:_\d)? + \| + # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic + # protein ID code. + (?:[A-Za-z0-9]+) + _ + # A mnemonic species identification code. + (?P([A-Za-z0-9]){1,5}) + # Small BFD uses a final value after an underscore, which we ignore. + (?:_\d+)? + $ + """, + re.VERBOSE, +) + + +@dataclasses.dataclass(frozen=True) +class Identifiers: + species_id: str = '' + + +def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers: + """Gets accession id and species from an msa sequence identifier. + + The sequence identifier has the format specified by + _UNIPROT_TREMBL_ENTRY_NAME_PATTERN or _UNIPROT_SWISSPROT_ENTRY_NAME_PATTERN. + An example of a sequence identifier: `tr|A0A146SKV9|A0A146SKV9_FUNHE` + + Args: + msa_sequence_identifier: a sequence identifier. + + Returns: + An `Identifiers` instance with a species_id. These + can be empty in the case where no identifier was found. + """ + matches = re.search(_UNIPROT_PATTERN, msa_sequence_identifier.strip()) + if matches: + return Identifiers(species_id=matches.group('SpeciesIdentifier')) + return Identifiers() + + +def _extract_sequence_identifier(description: str) -> Optional[str]: + """Extracts sequence identifier from description. 
Returns None if no match.""" + split_description = description.split() + if split_description: + return split_description[0].partition('/')[0] + else: + return None + + +def get_identifiers(description: str) -> Identifiers: + """Computes extra MSA features from the description.""" + sequence_identifier = _extract_sequence_identifier(description) + if sequence_identifier is None: + return Identifiers() + else: + return _parse_sequence_identifier(sequence_identifier) diff --git a/modelscope/models/science/unifold/msa/parsers.py b/modelscope/models/science/unifold/msa/parsers.py new file mode 100644 index 00000000..bf36c816 --- /dev/null +++ b/modelscope/models/science/unifold/msa/parsers.py @@ -0,0 +1,627 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions for parsing various file formats.""" +import collections +import dataclasses +import itertools +import re +import string +from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple + +DeletionMatrix = Sequence[Sequence[int]] + + +@dataclasses.dataclass(frozen=True) +class Msa: + """Class representing a parsed MSA file.""" + + sequences: Sequence[str] + deletion_matrix: DeletionMatrix + descriptions: Sequence[str] + + def __post_init__(self): + if not (len(self.sequences) == len(self.deletion_matrix) == len( + self.descriptions)): + raise ValueError( + 'All fields for an MSA must have the same length. ' + f'Got {len(self.sequences)} sequences, ' + f'{len(self.deletion_matrix)} rows in the deletion matrix and ' + f'{len(self.descriptions)} descriptions.') + + def __len__(self): + return len(self.sequences) + + def truncate(self, max_seqs: int): + return Msa( + sequences=self.sequences[:max_seqs], + deletion_matrix=self.deletion_matrix[:max_seqs], + descriptions=self.descriptions[:max_seqs], + ) + + +@dataclasses.dataclass(frozen=True) +class TemplateHit: + """Class representing a template hit.""" + + index: int + name: str + aligned_cols: int + sum_probs: Optional[float] + query: str + hit_sequence: str + indices_query: List[int] + indices_hit: List[int] + + +def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]: + """Parses FASTA string and returns list of strings with amino-acid sequences. + + Arguments: + fasta_string: The string contents of a FASTA file. + + Returns: + A tuple of two lists: + * A list of sequences. + * A list of sequence descriptions taken from the comment lines. In the + same order as the sequences. + """ + sequences = [] + descriptions = [] + index = -1 + for line in fasta_string.splitlines(): + line = line.strip() + if line.startswith('>'): + index += 1 + descriptions.append(line[1:]) # Remove the '>' at the beginning. + sequences.append('') + continue + elif not line: + continue # Skip blank lines. + sequences[index] += line + + return sequences, descriptions + + +def parse_stockholm(stockholm_string: str) -> Msa: + """Parses sequences and deletion matrix from stockholm format alignment. 
+ + Args: + stockholm_string: The string contents of a stockholm file. The first + sequence in the file should be the query sequence. + + Returns: + A tuple of: + * A list of sequences that have been aligned to the query. These + might contain duplicates. + * The deletion matrix for the alignment as a list of lists. The element + at `deletion_matrix[i][j]` is the number of residues deleted from + the aligned sequence i at residue position j. + * The names of the targets matched, including the jackhmmer subsequence + suffix. + """ + name_to_sequence = collections.OrderedDict() + for line in stockholm_string.splitlines(): + line = line.strip() + if not line or line.startswith(('#', '//')): + continue + name, sequence = line.split() + if name not in name_to_sequence: + name_to_sequence[name] = '' + name_to_sequence[name] += sequence + + msa = [] + deletion_matrix = [] + + query = '' + keep_columns = [] + for seq_index, sequence in enumerate(name_to_sequence.values()): + if seq_index == 0: + # Gather the columns with gaps from the query + query = sequence + keep_columns = [i for i, res in enumerate(query) if res != '-'] + + # Remove the columns with gaps in the query from all sequences. + aligned_sequence = ''.join([sequence[c] for c in keep_columns]) + + msa.append(aligned_sequence) + + # Count the number of deletions w.r.t. query. + deletion_vec = [] + deletion_count = 0 + for seq_res, query_res in zip(sequence, query): + if seq_res != '-' or query_res != '-': + if query_res == '-': + deletion_count += 1 + else: + deletion_vec.append(deletion_count) + deletion_count = 0 + deletion_matrix.append(deletion_vec) + + return Msa( + sequences=msa, + deletion_matrix=deletion_matrix, + descriptions=list(name_to_sequence.keys()), + ) + + +def parse_a3m(a3m_string: str) -> Msa: + """Parses sequences and deletion matrix from a3m format alignment. + + Args: + a3m_string: The string contents of a a3m file. The first sequence in the + file should be the query sequence. + + Returns: + A tuple of: + * A list of sequences that have been aligned to the query. These + might contain duplicates. + * The deletion matrix for the alignment as a list of lists. The element + at `deletion_matrix[i][j]` is the number of residues deleted from + the aligned sequence i at residue position j. + * A list of descriptions, one per sequence, from the a3m file. + """ + sequences, descriptions = parse_fasta(a3m_string) + deletion_matrix = [] + for msa_sequence in sequences: + deletion_vec = [] + deletion_count = 0 + for j in msa_sequence: + if j.islower(): + deletion_count += 1 + else: + deletion_vec.append(deletion_count) + deletion_count = 0 + deletion_matrix.append(deletion_vec) + + # Make the MSA matrix out of aligned (deletion-free) sequences. 
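+    # Illustration (assumed example, not in the original source): for the a3m
+    # row 'AbC' the lowercase 'b' is an insertion relative to the query, so the
+    # loop above yields deletion_vec == [0, 1]; the translation below then
+    # strips the lowercase letters, leaving the aligned sequence 'AC'.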
+ deletion_table = str.maketrans('', '', string.ascii_lowercase) + aligned_sequences = [s.translate(deletion_table) for s in sequences] + return Msa( + sequences=aligned_sequences, + deletion_matrix=deletion_matrix, + descriptions=descriptions, + ) + + +def _convert_sto_seq_to_a3m(query_non_gaps: Sequence[bool], + sto_seq: str) -> Iterable[str]: + for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq): + if is_query_res_non_gap: + yield sequence_res + elif sequence_res != '-': + yield sequence_res.lower() + + +def convert_stockholm_to_a3m( + stockholm_format: str, + max_sequences: Optional[int] = None, + remove_first_row_gaps: bool = True, +) -> str: + """Converts MSA in Stockholm format to the A3M format.""" + descriptions = {} + sequences = {} + reached_max_sequences = False + + for line in stockholm_format.splitlines(): + reached_max_sequences = max_sequences and len( + sequences) >= max_sequences + if line.strip() and not line.startswith(('#', '//')): + # Ignore blank lines, markup and end symbols - remainder are alignment + # sequence parts. + seqname, aligned_seq = line.split(maxsplit=1) + if seqname not in sequences: + if reached_max_sequences: + continue + sequences[seqname] = '' + sequences[seqname] += aligned_seq + + for line in stockholm_format.splitlines(): + if line[:4] == '#=GS': + # Description row - example format is: + # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ... + columns = line.split(maxsplit=3) + seqname, feature = columns[1:3] + value = columns[3] if len(columns) == 4 else '' + if feature != 'DE': + continue + if reached_max_sequences and seqname not in sequences: + continue + descriptions[seqname] = value + if len(descriptions) == len(sequences): + break + + # Convert sto format to a3m line by line + a3m_sequences = {} + if remove_first_row_gaps: + # query_sequence is assumed to be the first sequence + query_sequence = next(iter(sequences.values())) + query_non_gaps = [res != '-' for res in query_sequence] + for seqname, sto_sequence in sequences.items(): + # Dots are optional in a3m format and are commonly removed. + out_sequence = sto_sequence.replace('.', '') + if remove_first_row_gaps: + out_sequence = ''.join( + _convert_sto_seq_to_a3m(query_non_gaps, out_sequence)) + a3m_sequences[seqname] = out_sequence + + fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}" + for k in a3m_sequences) + return '\n'.join(fasta_chunks) + '\n' # Include terminating newline. + + +def _keep_line(line: str, seqnames: Set[str]) -> bool: + """Function to decide which lines to keep.""" + if not line.strip(): + return True + if line.strip() == '//': # End tag + return True + if line.startswith('# STOCKHOLM'): # Start tag + return True + if line.startswith('#=GC RF'): # Reference Annotation Line + return True + if line[:4] == '#=GS': # Description lines - keep if sequence in list. + _, seqname, _ = line.split(maxsplit=2) + return seqname in seqnames + elif line.startswith('#'): # Other markup - filter out + return False + else: # Alignment data - keep if sequence in list. + seqname = line.partition(' ')[0] + return seqname in seqnames + + +def truncate_stockholm_msa(stockholm_msa: str, max_sequences: int) -> str: + """Truncates a stockholm file to a maximum number of sequences.""" + seqnames = set() + filtered_lines = [] + for line in stockholm_msa.splitlines(): + if line.strip() and not line.startswith(('#', '//')): + # Ignore blank lines, markup and end symbols - remainder are alignment + # sequence parts. 
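+            # Illustration (assumed example): a row such as
+            # 'UniRef90_Q9H5Z4/4-78 MKVL...' yields the sequence name
+            # 'UniRef90_Q9H5Z4/4-78' via the partition below.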
+ seqname = line.partition(' ')[0] + seqnames.add(seqname) + if len(seqnames) >= max_sequences: + break + + for line in stockholm_msa.splitlines(): + if _keep_line(line, seqnames): + filtered_lines.append(line) + + return '\n'.join(filtered_lines) + '\n' + + +def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str: + """Removes empty columns (dashes-only) from a Stockholm MSA.""" + processed_lines = {} + unprocessed_lines = {} + for i, line in enumerate(stockholm_msa.splitlines()): + if line.startswith('#=GC RF'): + reference_annotation_i = i + reference_annotation_line = line + # Reached the end of this chunk of the alignment. Process chunk. + _, _, first_alignment = line.rpartition(' ') + mask = [] + for j in range(len(first_alignment)): + for _, unprocessed_line in unprocessed_lines.items(): + prefix, _, alignment = unprocessed_line.rpartition(' ') + if alignment[j] != '-': + mask.append(True) + break + else: # Every row contained a hyphen - empty column. + mask.append(False) + # Add reference annotation for processing with mask. + unprocessed_lines[ + reference_annotation_i] = reference_annotation_line + + if not any( + mask + ): # All columns were empty. Output empty lines for chunk. + for line_index in unprocessed_lines: + processed_lines[line_index] = '' + else: + for line_index, unprocessed_line in unprocessed_lines.items(): + prefix, _, alignment = unprocessed_line.rpartition(' ') + masked_alignment = ''.join( + itertools.compress(alignment, mask)) + processed_lines[ + line_index] = f'{prefix} {masked_alignment}' + + # Clear raw_alignments. + unprocessed_lines = {} + elif line.strip() and not line.startswith(('#', '//')): + unprocessed_lines[i] = line + else: + processed_lines[i] = line + return '\n'.join((processed_lines[i] for i in range(len(processed_lines)))) + + +def deduplicate_stockholm_msa(stockholm_msa: str) -> str: + """Remove duplicate sequences (ignoring insertions wrt query).""" + sequence_dict = collections.defaultdict(str) + + # First we must extract all sequences from the MSA. + for line in stockholm_msa.splitlines(): + # Only consider the alignments - ignore reference annotation, empty lines, + # descriptions or markup. + if line.strip() and not line.startswith(('#', '//')): + line = line.strip() + seqname, alignment = line.split() + sequence_dict[seqname] += alignment + + seen_sequences = set() + seqnames = set() + # First alignment is the query. + query_align = next(iter(sequence_dict.values())) + mask = [c != '-' for c in query_align] # Mask is False for insertions. + for seqname, alignment in sequence_dict.items(): + # Apply mask to remove all insertions from the string. 
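+        # Illustration (assumed example): if the query row is 'A-CD' then mask
+        # is [True, False, True, True], so a hit row 'ABCD' is compared as
+        # 'ACD'; hits that differ only inside query insertions collapse to one.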
+ masked_alignment = ''.join(itertools.compress(alignment, mask)) + if masked_alignment in seen_sequences: + continue + else: + seen_sequences.add(masked_alignment) + seqnames.add(seqname) + + filtered_lines = [] + for line in stockholm_msa.splitlines(): + if _keep_line(line, seqnames): + filtered_lines.append(line) + + return '\n'.join(filtered_lines) + '\n' + + +def _get_hhr_line_regex_groups(regex_pattern: str, + line: str) -> Sequence[Optional[str]]: + match = re.match(regex_pattern, line) + if match is None: + raise RuntimeError(f'Could not parse query line {line}') + return match.groups() + + +def _update_hhr_residue_indices_list(sequence: str, start_index: int, + indices_list: List[int]): + """Computes the relative indices for each residue with respect to the original sequence.""" + counter = start_index + for symbol in sequence: + if symbol == '-': + indices_list.append(-1) + else: + indices_list.append(counter) + counter += 1 + + +def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit: + """Parses the detailed HMM HMM comparison section for a single Hit. + + This works on .hhr files generated from both HHBlits and HHSearch. + + Args: + detailed_lines: A list of lines from a single comparison section between 2 + sequences (which each have their own HMM's) + + Returns: + A dictionary with the information from that detailed comparison section + + Raises: + RuntimeError: If a certain line cannot be processed + """ + # Parse first 2 lines. + number_of_hit = int(detailed_lines[0].split()[-1]) + name_hit = detailed_lines[1][1:] + + # Parse the summary line. + pattern = ( + 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t' + ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t ' + ']*Template_Neff=(.*)') + match = re.match(pattern, detailed_lines[2]) + if match is None: + raise RuntimeError( + 'Could not parse section: %s. Expected this: \n%s to contain summary.' + % (detailed_lines, detailed_lines[2])) + (_, _, _, aligned_cols, _, _, sum_probs, + _) = [float(x) for x in match.groups()] + + # The next section reads the detailed comparisons. These are in a 'human + # readable' format which has a fixed length. The strategy employed is to + # assume that each block starts with the query sequence line, and to parse + # that with a regexp in order to deduce the fixed length used for that block. + query = '' + hit_sequence = '' + indices_query = [] + indices_hit = [] + length_block = None + + for line in detailed_lines[3:]: + # Parse the query sequence line + if (line.startswith('Q ') and not line.startswith('Q ss_dssp') + and not line.startswith('Q ss_pred') + and not line.startswith('Q Consensus')): + # Thus the first 17 characters must be 'Q ', and we can parse + # everything after that. + # start sequence end total_sequence_length + patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)' + groups = _get_hhr_line_regex_groups(patt, line[17:]) + + # Get the length of the parsed block using the start and finish indices, + # and ensure it is the same as the actual block length. + start = int(groups[0]) - 1 # Make index zero based. + delta_query = groups[1] + end = int(groups[2]) + num_insertions = len([x for x in delta_query if x == '-']) + length_block = end - start + num_insertions + assert length_block == len(delta_query) + + # Update the query sequence and indices list. + query += delta_query + _update_hhr_residue_indices_list(delta_query, start, indices_query) + + elif line.startswith('T '): + # Parse the hit sequence. 
+ if (not line.startswith('T ss_dssp') + and not line.startswith('T ss_pred') + and not line.startswith('T Consensus')): + # Thus the first 17 characters must be 'T ', and we can + # parse everything after that. + # start sequence end total_sequence_length + patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)' + groups = _get_hhr_line_regex_groups(patt, line[17:]) + start = int(groups[0]) - 1 # Make index zero based. + delta_hit_sequence = groups[1] + assert length_block == len(delta_hit_sequence) + + # Update the hit sequence and indices list. + hit_sequence += delta_hit_sequence + _update_hhr_residue_indices_list(delta_hit_sequence, start, + indices_hit) + + return TemplateHit( + index=number_of_hit, + name=name_hit, + aligned_cols=int(aligned_cols), + sum_probs=sum_probs, + query=query, + hit_sequence=hit_sequence, + indices_query=indices_query, + indices_hit=indices_hit, + ) + + +def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]: + """Parses the content of an entire HHR file.""" + lines = hhr_string.splitlines() + + # Each .hhr file starts with a results table, then has a sequence of hit + # "paragraphs", each paragraph starting with a line 'No '. We + # iterate through each paragraph to parse each hit. + + block_starts = [ + i for i, line in enumerate(lines) if line.startswith('No ') + ] + + hits = [] + if block_starts: + block_starts.append(len(lines)) # Add the end of the final block. + for i in range(len(block_starts) - 1): + hits.append( + _parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]])) + return hits + + +def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]: + """Parse target to e-value mapping parsed from Jackhmmer tblout string.""" + e_values = {'query': 0} + lines = [line for line in tblout.splitlines() if line[0] != '#'] + # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are + # space-delimited. Relevant fields are (1) target name: and + # (5) E-value (full sequence) (numbering from 1). + for line in lines: + fields = line.split() + e_value = fields[4] + target_name = fields[0] + e_values[target_name] = float(e_value) + return e_values + + +def _get_indices(sequence: str, start: int) -> List[int]: + """Returns indices for non-gap/insert residues starting at the given index.""" + indices = [] + counter = start + for symbol in sequence: + # Skip gaps but add a placeholder so that the alignment is preserved. + if symbol == '-': + indices.append(-1) + # Skip deleted residues, but increase the counter. + elif symbol.islower(): + counter += 1 + # Normal aligned residue. Increase the counter and append to indices. 
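+        # (Added example, assuming a mixed input: _get_indices('AB-c-D', start=0)
+        # returns [0, 1, -1, -1, 3]: gaps keep a -1 placeholder, while the
+        # lowercase 'c' is skipped but still advances the counter.)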
+ else: + indices.append(counter) + counter += 1 + return indices + + +@dataclasses.dataclass(frozen=True) +class HitMetadata: + pdb_id: str + chain: str + start: int + end: int + length: int + text: str + + +def _parse_hmmsearch_description(description: str) -> HitMetadata: + """Parses the hmmsearch A3M sequence description line.""" + # Example 1: >4pqx_A/2-217 [subseq from] mol:protein length:217 Free text + # Example 2: >5g3r_A/1-55 [subseq from] mol:protein length:352 + match = re.match( + r'^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$', + description.strip(), + ) + + if not match: + raise ValueError(f'Could not parse description: "{description}".') + + return HitMetadata( + pdb_id=match[1], + chain=match[2], + start=int(match[3]), + end=int(match[4]), + length=int(match[5]), + text=match[6], + ) + + +def parse_hmmsearch_a3m(query_sequence: str, + a3m_string: str, + skip_first: bool = True) -> Sequence[TemplateHit]: + """Parses an a3m string produced by hmmsearch. + + Args: + query_sequence: The query sequence. + a3m_string: The a3m string produced by hmmsearch. + skip_first: Whether to skip the first sequence in the a3m string. + + Returns: + A sequence of `TemplateHit` results. + """ + # Zip the descriptions and MSAs together, skip the first query sequence. + parsed_a3m = list(zip(*parse_fasta(a3m_string))) + if skip_first: + parsed_a3m = parsed_a3m[1:] + + indices_query = _get_indices(query_sequence, start=0) + + hits = [] + for i, (hit_sequence, hit_description) in enumerate(parsed_a3m, start=1): + if 'mol:protein' not in hit_description: + continue # Skip non-protein chains. + metadata = _parse_hmmsearch_description(hit_description) + # Aligned columns are only the match states. + aligned_cols = sum([r.isupper() and r != '-' for r in hit_sequence]) + indices_hit = _get_indices(hit_sequence, start=metadata.start - 1) + + hit = TemplateHit( + index=i, + name=f'{metadata.pdb_id}_{metadata.chain}', + aligned_cols=aligned_cols, + sum_probs=None, + query=query_sequence, + hit_sequence=hit_sequence.upper(), + indices_query=indices_query, + indices_hit=indices_hit, + ) + hits.append(hit) + + return hits diff --git a/modelscope/models/science/unifold/msa/pipeline.py b/modelscope/models/science/unifold/msa/pipeline.py new file mode 100644 index 00000000..b7889bff --- /dev/null +++ b/modelscope/models/science/unifold/msa/pipeline.py @@ -0,0 +1,282 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
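+
+# A minimal usage sketch (illustrative only: the binary and database paths are
+# placeholders, and `searcher` / `featurizer` stand for pre-built template
+# search and featurizer objects; none of these names come from this patch):
+#
+#     pipeline = DataPipeline(
+#         jackhmmer_binary_path='/usr/bin/jackhmmer',
+#         hhblits_binary_path='/usr/bin/hhblits',
+#         uniref90_database_path='uniref90.fasta',
+#         mgnify_database_path='mgy_clusters.fa',
+#         bfd_database_path=None,
+#         uniclust30_database_path=None,
+#         small_bfd_database_path='small_bfd_sequences.fasta',
+#         uniprot_database_path='uniprot.fasta',
+#         template_searcher=searcher,
+#         template_featurizer=featurizer,
+#         use_small_bfd=True)
+#     features = pipeline.process('query.fasta', msa_output_dir='msas')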
+"""Functions for building the input features for the unifold model.""" + +import os +from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union + +import numpy as np +from absl import logging + +from modelscope.models.science.unifold.data import residue_constants +from modelscope.models.science.unifold.msa import (msa_identifiers, parsers, + templates) +from modelscope.models.science.unifold.msa.tools import (hhblits, hhsearch, + hmmsearch, jackhmmer) + +FeatureDict = MutableMapping[str, np.ndarray] +TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch] + + +def make_sequence_features(sequence: str, description: str, + num_res: int) -> FeatureDict: + """Constructs a feature dict of sequence features.""" + features = {} + features['aatype'] = residue_constants.sequence_to_onehot( + sequence=sequence, + mapping=residue_constants.restype_order_with_x, + map_unknown_to_x=True, + ) + features['between_segment_residues'] = np.zeros((num_res, ), + dtype=np.int32) + features['domain_name'] = np.array([description.encode('utf-8')], + dtype=np.object_) + features['residue_index'] = np.array(range(num_res), dtype=np.int32) + features['seq_length'] = np.array([num_res] * num_res, dtype=np.int32) + features['sequence'] = np.array([sequence.encode('utf-8')], + dtype=np.object_) + return features + + +def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict: + """Constructs a feature dict of MSA features.""" + if not msas: + raise ValueError('At least one MSA must be provided.') + + int_msa = [] + deletion_matrix = [] + species_ids = [] + seen_sequences = set() + for msa_index, msa in enumerate(msas): + if not msa: + raise ValueError( + f'MSA {msa_index} must contain at least one sequence.') + for sequence_index, sequence in enumerate(msa.sequences): + if sequence in seen_sequences: + continue + seen_sequences.add(sequence) + int_msa.append( + [residue_constants.HHBLITS_AA_TO_ID[res] for res in sequence]) + deletion_matrix.append(msa.deletion_matrix[sequence_index]) + identifiers = msa_identifiers.get_identifiers( + msa.descriptions[sequence_index]) + species_ids.append(identifiers.species_id.encode('utf-8')) + + num_res = len(msas[0].sequences[0]) + num_alignments = len(int_msa) + features = {} + features['deletion_matrix_int'] = np.array(deletion_matrix, dtype=np.int32) + features['msa'] = np.array(int_msa, dtype=np.int32) + features['num_alignments'] = np.array( + [num_alignments] * num_res, dtype=np.int32) + features['msa_species_identifiers'] = np.array( + species_ids, dtype=np.object_) + return features + + +def run_msa_tool( + msa_runner, + input_fasta_path: str, + msa_out_path: str, + msa_format: str, + use_precomputed_msas: bool, +) -> Mapping[str, Any]: + """Runs an MSA tool, checking if output already exists first.""" + if not use_precomputed_msas or not os.path.exists(msa_out_path): + result = msa_runner.query(input_fasta_path)[0] + with open(msa_out_path, 'w') as f: + f.write(result[msa_format]) + else: + logging.warning('Reading MSA from file %s', msa_out_path) + with open(msa_out_path, 'r') as f: + result = {msa_format: f.read()} + return result + + +class DataPipeline: + """Runs the alignment tools and assembles the input features.""" + + def __init__( + self, + jackhmmer_binary_path: str, + hhblits_binary_path: str, + uniref90_database_path: str, + mgnify_database_path: str, + bfd_database_path: Optional[str], + uniclust30_database_path: Optional[str], + small_bfd_database_path: Optional[str], + uniprot_database_path: Optional[str], + 
template_searcher: TemplateSearcher, + template_featurizer: templates.TemplateHitFeaturizer, + use_small_bfd: bool, + mgnify_max_hits: int = 501, + uniref_max_hits: int = 10000, + use_precomputed_msas: bool = False, + ): + """Initializes the data pipeline.""" + self._use_small_bfd = use_small_bfd + self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer( + binary_path=jackhmmer_binary_path, + database_path=uniref90_database_path) + if use_small_bfd: + self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer( + binary_path=jackhmmer_binary_path, + database_path=small_bfd_database_path) + else: + self.hhblits_bfd_uniclust_runner = hhblits.HHBlits( + binary_path=hhblits_binary_path, + databases=[bfd_database_path, uniclust30_database_path], + ) + self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer( + binary_path=jackhmmer_binary_path, + database_path=mgnify_database_path) + self.jackhmmer_uniprot_runner = jackhmmer.Jackhmmer( + binary_path=jackhmmer_binary_path, + database_path=uniprot_database_path) + self.template_searcher = template_searcher + self.template_featurizer = template_featurizer + self.mgnify_max_hits = mgnify_max_hits + self.uniref_max_hits = uniref_max_hits + self.use_precomputed_msas = use_precomputed_msas + + def process(self, input_fasta_path: str, + msa_output_dir: str) -> FeatureDict: + """Runs alignment tools on the input sequence and creates features.""" + with open(input_fasta_path) as f: + input_fasta_str = f.read() + input_seqs, input_descs = parsers.parse_fasta(input_fasta_str) + if len(input_seqs) != 1: + raise ValueError( + f'More than one input sequence found in {input_fasta_path}.') + input_sequence = input_seqs[0] + input_description = input_descs[0] + num_res = len(input_sequence) + + uniref90_out_path = os.path.join(msa_output_dir, 'uniref90_hits.sto') + jackhmmer_uniref90_result = run_msa_tool( + self.jackhmmer_uniref90_runner, + input_fasta_path, + uniref90_out_path, + 'sto', + self.use_precomputed_msas, + ) + mgnify_out_path = os.path.join(msa_output_dir, 'mgnify_hits.sto') + jackhmmer_mgnify_result = run_msa_tool( + self.jackhmmer_mgnify_runner, + input_fasta_path, + mgnify_out_path, + 'sto', + self.use_precomputed_msas, + ) + + msa_for_templates = jackhmmer_uniref90_result['sto'] + msa_for_templates = parsers.truncate_stockholm_msa( + msa_for_templates, max_sequences=self.uniref_max_hits) + msa_for_templates = parsers.deduplicate_stockholm_msa( + msa_for_templates) + msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa( + msa_for_templates) + + if self.template_searcher.input_format == 'sto': + pdb_templates_result = self.template_searcher.query( + msa_for_templates) + elif self.template_searcher.input_format == 'a3m': + uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m( + msa_for_templates) + pdb_templates_result = self.template_searcher.query( + uniref90_msa_as_a3m) + else: + raise ValueError('Unrecognized template input format: ' + f'{self.template_searcher.input_format}') + + pdb_hits_out_path = os.path.join( + msa_output_dir, f'pdb_hits.{self.template_searcher.output_format}') + with open(pdb_hits_out_path, 'w') as f: + f.write(pdb_templates_result) + + uniref90_msa = parsers.parse_stockholm( + jackhmmer_uniref90_result['sto']) + uniref90_msa = uniref90_msa.truncate(max_seqs=self.uniref_max_hits) + mgnify_msa = parsers.parse_stockholm(jackhmmer_mgnify_result['sto']) + mgnify_msa = mgnify_msa.truncate(max_seqs=self.mgnify_max_hits) + + pdb_template_hits = self.template_searcher.get_template_hits( + output_string=pdb_templates_result, 
input_sequence=input_sequence) + + if self._use_small_bfd: + bfd_out_path = os.path.join(msa_output_dir, 'small_bfd_hits.sto') + jackhmmer_small_bfd_result = run_msa_tool( + self.jackhmmer_small_bfd_runner, + input_fasta_path, + bfd_out_path, + 'sto', + self.use_precomputed_msas, + ) + bfd_msa = parsers.parse_stockholm( + jackhmmer_small_bfd_result['sto']) + else: + bfd_out_path = os.path.join(msa_output_dir, + 'bfd_uniclust_hits.a3m') + hhblits_bfd_uniclust_result = run_msa_tool( + self.hhblits_bfd_uniclust_runner, + input_fasta_path, + bfd_out_path, + 'a3m', + self.use_precomputed_msas, + ) + bfd_msa = parsers.parse_a3m(hhblits_bfd_uniclust_result['a3m']) + + templates_result = self.template_featurizer.get_templates( + query_sequence=input_sequence, hits=pdb_template_hits) + + sequence_features = make_sequence_features( + sequence=input_sequence, + description=input_description, + num_res=num_res) + + msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa)) + + logging.info('Uniref90 MSA size: %d sequences.', len(uniref90_msa)) + logging.info('BFD MSA size: %d sequences.', len(bfd_msa)) + logging.info('MGnify MSA size: %d sequences.', len(mgnify_msa)) + logging.info( + 'Final (deduplicated) MSA size: %d sequences.', + msa_features['num_alignments'][0], + ) + logging.info( + 'Total number of templates (NB: this can include bad ' + 'templates and is later filtered to top 4): %d.', + templates_result.features['template_domain_names'].shape[0], + ) + + return { + **sequence_features, + **msa_features, + **templates_result.features + } + + def process_uniprot(self, input_fasta_path: str, + msa_output_dir: str) -> FeatureDict: + uniprot_path = os.path.join(msa_output_dir, 'uniprot_hits.sto') + uniprot_result = run_msa_tool( + self.jackhmmer_uniprot_runner, + input_fasta_path, + uniprot_path, + 'sto', + self.use_precomputed_msas, + ) + msa = parsers.parse_stockholm(uniprot_result['sto']) + msa = msa.truncate(max_seqs=50000) + all_seq_dict = make_msa_features([msa]) + return all_seq_dict diff --git a/modelscope/models/science/unifold/msa/templates.py b/modelscope/models/science/unifold/msa/templates.py new file mode 100644 index 00000000..fe3bcef9 --- /dev/null +++ b/modelscope/models/science/unifold/msa/templates.py @@ -0,0 +1,1110 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
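+
+# A minimal usage sketch (illustrative only: `SomeHitFeaturizer` stands for a
+# concrete subclass of the abstract TemplateHitFeaturizer defined below, the
+# paths and date are placeholders, and `hits` would be a list of
+# parsers.TemplateHit objects, e.g. from parsers.parse_hhr):
+#
+#     featurizer = SomeHitFeaturizer(
+#         mmcif_dir='pdb_mmcif/mmcif_files',
+#         max_template_date='2022-01-01',
+#         max_hits=20,
+#         kalign_binary_path='/usr/bin/kalign',
+#         release_dates_path=None,
+#         obsolete_pdbs_path=None)
+#     result = featurizer.get_templates(query_sequence=sequence, hits=hits)
+#     # result.features is expected to hold the arrays listed in
+#     # TEMPLATE_FEATURES below.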
+"""Functions for getting templates and calculating template features.""" +import abc +import dataclasses +import datetime +import functools +import glob +import os +import re +from typing import Any, Dict, Mapping, Optional, Sequence, Tuple + +import numpy as np +from absl import logging + +from modelscope.models.science.unifold.data import residue_constants +from modelscope.models.science.unifold.msa import mmcif, parsers +from modelscope.models.science.unifold.msa.tools import kalign + + +class Error(Exception): + """Base class for exceptions.""" + + +class NoChainsError(Error): + """An error indicating that template mmCIF didn't have any chains.""" + + +class SequenceNotInTemplateError(Error): + """An error indicating that template mmCIF didn't contain the sequence.""" + + +class NoAtomDataInTemplateError(Error): + """An error indicating that template mmCIF didn't contain atom positions.""" + + +class TemplateAtomMaskAllZerosError(Error): + """An error indicating that template mmCIF had all atom positions masked.""" + + +class QueryToTemplateAlignError(Error): + """An error indicating that the query can't be aligned to the template.""" + + +class CaDistanceError(Error): + """An error indicating that a CA atom distance exceeds a threshold.""" + + +class MultipleChainsError(Error): + """An error indicating that multiple chains were found for a given ID.""" + + +# Prefilter exceptions. +class PrefilterError(Exception): + """A base class for template prefilter exceptions.""" + + +class DateError(PrefilterError): + """An error indicating that the hit date was after the max allowed date.""" + + +class AlignRatioError(PrefilterError): + """An error indicating that the hit align ratio to the query was too small.""" + + +class DuplicateError(PrefilterError): + """An error indicating that the hit was an exact subsequence of the query.""" + + +class LengthError(PrefilterError): + """An error indicating that the hit was too short.""" + + +TEMPLATE_FEATURES = { + 'template_aatype': np.float32, + 'template_all_atom_mask': np.float32, + 'template_all_atom_positions': np.float32, + 'template_domain_names': np.object_, + 'template_sequence': np.object_, + 'template_sum_probs': np.float32, +} + + +def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]: + """Returns PDB id and chain id for an HHSearch Hit.""" + # PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown. + id_match = re.match(r'[a-zA-Z\d]{4}_[a-zA-Z0-9.]+', hit.name) + if not id_match: + raise ValueError( + f'hit.name did not start with PDBID_chain: {hit.name}') + pdb_id, chain_id = id_match.group(0).split('_') + return pdb_id.lower(), chain_id + + +def _is_after_cutoff( + pdb_id: str, + release_dates: Mapping[str, datetime.datetime], + release_date_cutoff: Optional[datetime.datetime], +) -> bool: + """Checks if the template date is after the release date cutoff. + + Args: + pdb_id: 4 letter pdb code. + release_dates: Dictionary mapping PDB ids to their structure release dates. + release_date_cutoff: Max release date that is valid for this query. + + Returns: + True if the template release date is after the cutoff, False otherwise. + """ + if release_date_cutoff is None: + raise ValueError('The release_date_cutoff must not be None.') + if pdb_id in release_dates: + return release_dates[pdb_id] > release_date_cutoff + else: + # Since this is just a quick prefilter to reduce the number of mmCIF files + # we need to parse, we don't have to worry about returning True here. 
+ return False + + +def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, Optional[str]]: + """Parses the data file from PDB that lists which pdb_ids are obsolete.""" + with open(obsolete_file_path) as f: + result = {} + for line in f: + line = line.strip() + # Format: Date From To + # 'OBSLTE 06-NOV-19 6G9Y' - Removed, rare + # 'OBSLTE 31-JUL-94 116L 216L' - Replaced, common + # 'OBSLTE 26-SEP-06 2H33 2JM5 2OWI' - Replaced by multiple, rare + if line.startswith('OBSLTE'): + if len(line) > 30: + # Replaced by at least one structure. + from_id = line[20:24].lower() + to_id = line[29:33].lower() + result[from_id] = to_id + elif len(line) == 24: + # Removed. + from_id = line[20:24].lower() + result[from_id] = None + return result + + +def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]: + """Parses release dates file, returns a mapping from PDBs to release dates.""" + if path.endswith('txt'): + release_dates = {} + with open(path, 'r') as f: + for line in f: + pdb_id, date = line.split(':') + date = date.strip() + # Python 3.6 doesn't have datetime.date.fromisoformat() which is about + # 90x faster than strptime. However, splitting the string manually is + # about 10x faster than strptime. + release_dates[pdb_id.strip()] = datetime.datetime( + year=int(date[:4]), + month=int(date[5:7]), + day=int(date[8:10])) + return release_dates + else: + raise ValueError('Invalid format of the release date file %s.' % path) + + +def _assess_hhsearch_hit( + hit: parsers.TemplateHit, + hit_pdb_code: str, + query_sequence: str, + release_dates: Mapping[str, datetime.datetime], + release_date_cutoff: datetime.datetime, + max_subsequence_ratio: float = 0.95, + min_align_ratio: float = 0.1, +) -> bool: + """Determines if template is valid (without parsing the template mmcif file). + + Args: + hit: HhrHit for the template. + hit_pdb_code: The 4 letter pdb code of the template hit. This might be + different from the value in the actual hit since the original pdb might + have become obsolete. + query_sequence: Amino acid sequence of the query. + release_dates: Dictionary mapping pdb codes to their structure release + dates. + release_date_cutoff: Max release date that is valid for this query. + max_subsequence_ratio: Exclude any exact matches with this much overlap. + min_align_ratio: Minimum overlap between the template and query. + + Returns: + True if the hit passed the prefilter. Raises an exception otherwise. + + Raises: + DateError: If the hit date was after the max allowed date. + AlignRatioError: If the hit align ratio to the query was too small. + DuplicateError: If the hit was an exact subsequence of the query. + LengthError: If the hit was too short. + """ + aligned_cols = hit.aligned_cols + align_ratio = aligned_cols / len(query_sequence) + + template_sequence = hit.hit_sequence.replace('-', '') + length_ratio = float(len(template_sequence)) / len(query_sequence) + + # Check whether the template is a large subsequence or duplicate of original + # query. This can happen due to duplicate entries in the PDB database. + duplicate = ( + template_sequence in query_sequence + and length_ratio > max_subsequence_ratio) + + if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff): + raise DateError( + f'Date ({release_dates[hit_pdb_code]}) > max template date ' + f'({release_date_cutoff}).') + + if align_ratio <= min_align_ratio: + raise AlignRatioError( + 'Proportion of residues aligned to query too small. 
' + f'Align ratio: {align_ratio}.') + + if duplicate: + raise DuplicateError( + 'Template is an exact subsequence of query with large ' + f'coverage. Length ratio: {length_ratio}.') + + if len(template_sequence) < 10: + raise LengthError( + f'Template too short. Length: {len(template_sequence)}.') + + return True + + +def _find_template_in_pdb( + template_chain_id: str, template_sequence: str, + mmcif_object: mmcif.MmcifObject) -> Tuple[str, str, int]: + """Tries to find the template chain in the given pdb file. + + This method tries the three following things in order: + 1. Tries if there is an exact match in both the chain ID and the sequence. + If yes, the chain sequence is returned. Otherwise: + 2. Tries if there is an exact match only in the sequence. + If yes, the chain sequence is returned. Otherwise: + 3. Tries if there is a fuzzy match (X = wildcard) in the sequence. + If yes, the chain sequence is returned. + If none of these succeed, a SequenceNotInTemplateError is thrown. + + Args: + template_chain_id: The template chain ID. + template_sequence: The template chain sequence. + mmcif_object: The PDB object to search for the template in. + + Returns: + A tuple with: + * The chain sequence that was found to match the template in the PDB object. + * The ID of the chain that is being returned. + * The offset where the template sequence starts in the chain sequence. + + Raises: + SequenceNotInTemplateError: If no match is found after the steps described + above. + """ + # Try if there is an exact match in both the chain ID and the (sub)sequence. + pdb_id = mmcif_object.file_id + chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id) + if chain_sequence and (template_sequence in chain_sequence): + logging.info('Found an exact template match %s_%s.', pdb_id, + template_chain_id) + mapping_offset = chain_sequence.find(template_sequence) + return chain_sequence, template_chain_id, mapping_offset + + # Try if there is an exact match in the (sub)sequence only. + for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items(): + if chain_sequence and (template_sequence in chain_sequence): + logging.info('Found a sequence-only match %s_%s.', pdb_id, + chain_id) + mapping_offset = chain_sequence.find(template_sequence) + return chain_sequence, chain_id, mapping_offset + + # Return a chain sequence that fuzzy matches (X = wildcard) the template. + # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit. + regex = ['.' if aa == 'X' else '(?:%s|X)' % aa for aa in template_sequence] + regex = re.compile(''.join(regex)) + for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items(): + match = re.search(regex, chain_sequence) + if match: + logging.info('Found a fuzzy sequence-only match %s_%s.', pdb_id, + chain_id) + mapping_offset = match.start() + return chain_sequence, chain_id, mapping_offset + + # No hits, raise an error. + raise SequenceNotInTemplateError( + 'Could not find the template sequence in %s_%s. Template sequence: %s, ' + 'chain_to_seqres: %s' % (pdb_id, template_chain_id, template_sequence, + mmcif_object.chain_to_seqres)) + + +def _realign_pdb_template_to_query( + old_template_sequence: str, + template_chain_id: str, + mmcif_object: mmcif.MmcifObject, + old_mapping: Mapping[int, int], + kalign_binary_path: str, +) -> Tuple[str, Mapping[int, int]]: + """Aligns template from the mmcif_object to the query. 
+ + In case PDB70 contains a different version of the template sequence, we need + to perform a realignment to the actual sequence that is in the mmCIF file. + This method performs such realignment, but returns the new sequence and + mapping only if the sequence in the mmCIF file is 90% identical to the old + sequence. + + Note that the old_template_sequence comes from the hit, and contains only that + part of the chain that matches with the query while the new_template_sequence + is the full chain. + + Args: + old_template_sequence: The template sequence that was returned by the PDB + template search (typically done using HHSearch). + template_chain_id: The template chain id was returned by the PDB template + search (typically done using HHSearch). This is used to find the right + chain in the mmcif_object chain_to_seqres mapping. + mmcif_object: A mmcif_object which holds the actual template data. + old_mapping: A mapping from the query sequence to the template sequence. + This mapping will be used to compute the new mapping from the query + sequence to the actual mmcif_object template sequence by aligning the + old_template_sequence and the actual template sequence. + kalign_binary_path: The path to a kalign executable. + + Returns: + A tuple (new_template_sequence, new_query_to_template_mapping) where: + * new_template_sequence is the actual template sequence that was found in + the mmcif_object. + * new_query_to_template_mapping is the new mapping from the query to the + actual template found in the mmcif_object. + + Raises: + QueryToTemplateAlignError: + * If there was an error thrown by the alignment tool. + * Or if the actual template sequence differs by more than 10% from the + old_template_sequence. + """ + aligner = kalign.Kalign(binary_path=kalign_binary_path) + new_template_sequence = mmcif_object.chain_to_seqres.get( + template_chain_id, '') + + # Sometimes the template chain id is unknown. But if there is only a single + # sequence within the mmcif_object, it is safe to assume it is that one. + if not new_template_sequence: + if len(mmcif_object.chain_to_seqres) == 1: + logging.info( + 'Could not find %s in %s, but there is only 1 sequence, so ' + 'using that one.', + template_chain_id, + mmcif_object.file_id, + ) + new_template_sequence = list( + mmcif_object.chain_to_seqres.values())[0] + else: + raise QueryToTemplateAlignError( + f'Could not find chain {template_chain_id} in {mmcif_object.file_id}. ' + 'If there are no mmCIF parsing errors, it is possible it was not a ' + 'protein chain.') + + try: + parsed_a3m = parsers.parse_a3m( + aligner.align([old_template_sequence, new_template_sequence])) + old_aligned_template, new_aligned_template = parsed_a3m.sequences + except Exception as e: + raise QueryToTemplateAlignError( + 'Could not align old template %s to template %s (%s_%s). 
Error: %s' + % ( + old_template_sequence, + new_template_sequence, + mmcif_object.file_id, + template_chain_id, + str(e), + )) + + logging.info( + 'Old aligned template: %s\nNew aligned template: %s', + old_aligned_template, + new_aligned_template, + ) + + old_to_new_template_mapping = {} + old_template_index = -1 + new_template_index = -1 + num_same = 0 + for old_template_aa, new_template_aa in zip(old_aligned_template, + new_aligned_template): + if old_template_aa != '-': + old_template_index += 1 + if new_template_aa != '-': + new_template_index += 1 + if old_template_aa != '-' and new_template_aa != '-': + old_to_new_template_mapping[ + old_template_index] = new_template_index + if old_template_aa == new_template_aa: + num_same += 1 + + # Require at least 90 % sequence identity wrt to the shorter of the sequences. + if (float(num_same) + / min(len(old_template_sequence), len(new_template_sequence)) + < # noqa W504 + 0.9): + raise QueryToTemplateAlignError( + 'Insufficient similarity of the sequence in the database: %s to the ' + 'actual sequence in the mmCIF file %s_%s: %s. We require at least ' + '90 %% similarity wrt to the shorter of the sequences. This is not a ' + 'problem unless you think this is a template that should be included.' + % ( + old_template_sequence, + mmcif_object.file_id, + template_chain_id, + new_template_sequence, + )) + + new_query_to_template_mapping = {} + for query_index, old_template_index in old_mapping.items(): + new_query_to_template_mapping[ + query_index] = old_to_new_template_mapping.get( + old_template_index, -1) + + new_template_sequence = new_template_sequence.replace('-', '') + + return new_template_sequence, new_query_to_template_mapping + + +def _check_residue_distances(all_positions: np.ndarray, + all_positions_mask: np.ndarray, + max_ca_ca_distance: float): + """Checks if the distance between unmasked neighbor residues is ok.""" + ca_position = residue_constants.atom_order['CA'] + prev_is_unmasked = False + prev_calpha = None + for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)): + this_is_unmasked = bool(mask[ca_position]) + if this_is_unmasked: + this_calpha = coords[ca_position] + if prev_is_unmasked: + distance = np.linalg.norm(this_calpha - prev_calpha) + if distance > max_ca_ca_distance: + raise CaDistanceError( + 'The distance between residues %d and %d is %f > limit %f.' + % (i, i + 1, distance, max_ca_ca_distance)) + prev_calpha = this_calpha + prev_is_unmasked = this_is_unmasked + + +def _get_atom_positions( + mmcif_object: mmcif.MmcifObject, auth_chain_id: str, + max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]: + """Gets atom positions and mask from a list of Biopython Residues.""" + num_res = len(mmcif_object.chain_to_seqres[auth_chain_id]) + + relevant_chains = [ + c for c in mmcif_object.structure.get_chains() if c.id == auth_chain_id + ] + if len(relevant_chains) != 1: + raise MultipleChainsError( + f'Expected exactly one chain in structure with id {auth_chain_id}.' 
+ ) + chain = relevant_chains[0] + + all_positions = np.zeros([num_res, residue_constants.atom_type_num, 3]) + all_positions_mask = np.zeros([num_res, residue_constants.atom_type_num], + dtype=np.int64) + for res_index in range(num_res): + pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32) + mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32) + res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][ + res_index] + if not res_at_position.is_missing: + res = chain[( + res_at_position.hetflag, + res_at_position.position.residue_number, + res_at_position.position.insertion_code, + )] + for atom in res.get_atoms(): + atom_name = atom.get_name() + x, y, z = atom.get_coord() + if atom_name in residue_constants.atom_order.keys(): + pos[residue_constants.atom_order[atom_name]] = [x, y, z] + mask[residue_constants.atom_order[atom_name]] = 1.0 + elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE': + # Put the coordinates of the selenium atom in the sulphur column. + pos[residue_constants.atom_order['SD']] = [x, y, z] + mask[residue_constants.atom_order['SD']] = 1.0 + + # Fix naming errors in arginine residues where NH2 is incorrectly + # assigned to be closer to CD than NH1. + cd = residue_constants.atom_order['CD'] + nh1 = residue_constants.atom_order['NH1'] + nh2 = residue_constants.atom_order['NH2'] + if (res.get_resname() == 'ARG' + and all(mask[atom_index] for atom_index in (cd, nh1, nh2)) + and (np.linalg.norm(pos[nh1] - pos[cd]) > # noqa W504 + np.linalg.norm(pos[nh2] - pos[cd]))): + pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy() + mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy() + + all_positions[res_index] = pos + all_positions_mask[res_index] = mask + _check_residue_distances(all_positions, all_positions_mask, + max_ca_ca_distance) + return all_positions, all_positions_mask + + +def _extract_template_features( + mmcif_object: mmcif.MmcifObject, + pdb_id: str, + mapping: Mapping[int, int], + template_sequence: str, + query_sequence: str, + template_chain_id: str, + kalign_binary_path: str, +) -> Tuple[Dict[str, Any], Optional[str]]: + """Parses atom positions in the target structure and aligns with the query. + + Atoms for each residue in the template structure are indexed to coincide + with their corresponding residue in the query sequence, according to the + alignment mapping provided. + + Args: + mmcif_object: mmcif_parsing.MmcifObject representing the template. + pdb_id: PDB code for the template. + mapping: Dictionary mapping indices in the query sequence to indices in + the template sequence. + template_sequence: String describing the amino acid sequence for the + template protein. + query_sequence: String describing the amino acid sequence for the query + protein. + template_chain_id: String ID describing which chain in the structure proto + should be used. + kalign_binary_path: The path to a kalign executable used for template + realignment. + + Returns: + A tuple with: + * A dictionary containing the extra features derived from the template + protein structure. + * A warning message if the hit was realigned to the actual mmCIF sequence. + Otherwise None. + + Raises: + NoChainsError: If the mmcif object doesn't contain any chains. + SequenceNotInTemplateError: If the given chain id / sequence can't + be found in the mmcif object. + QueryToTemplateAlignError: If the actual template in the mmCIF file + can't be aligned to the query. 
+ NoAtomDataInTemplateError: If the mmcif object doesn't contain + atom positions. + TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any + unmasked residues. + """ + if mmcif_object is None or not mmcif_object.chain_to_seqres: + raise NoChainsError('No chains in PDB: %s_%s' % + (pdb_id, template_chain_id)) + + warning = None + try: + seqres, chain_id, mapping_offset = _find_template_in_pdb( + template_chain_id=template_chain_id, + template_sequence=template_sequence, + mmcif_object=mmcif_object, + ) + except SequenceNotInTemplateError: + # If PDB70 contains a different version of the template, we use the sequence + # from the mmcif_object. + chain_id = template_chain_id + warning = ( + f'The exact sequence {template_sequence} was not found in ' + f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.' + ) + logging.warning(warning) + # This throws an exception if it fails to realign the hit. + seqres, mapping = _realign_pdb_template_to_query( + old_template_sequence=template_sequence, + template_chain_id=template_chain_id, + mmcif_object=mmcif_object, + old_mapping=mapping, + kalign_binary_path=kalign_binary_path, + ) + logging.info( + 'Sequence in %s_%s: %s successfully realigned to %s', + pdb_id, + chain_id, + template_sequence, + seqres, + ) + # The template sequence changed. + template_sequence = seqres + # No mapping offset, the query is aligned to the actual sequence. + mapping_offset = 0 + + try: + # Essentially set to infinity - we don't want to reject templates unless + # they're really really bad. + all_atom_positions, all_atom_mask = _get_atom_positions( + mmcif_object, chain_id, max_ca_ca_distance=150.0) + except (CaDistanceError, KeyError) as ex: + raise NoAtomDataInTemplateError('Could not get atom data (%s_%s): %s' % + (pdb_id, chain_id, str(ex))) from ex + + all_atom_positions = np.split(all_atom_positions, + all_atom_positions.shape[0]) + all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0]) + + output_templates_sequence = [] + templates_all_atom_positions = [] + templates_all_atom_masks = [] + + for _ in query_sequence: + # Residues in the query_sequence that are not in the template_sequence: + templates_all_atom_positions.append( + np.zeros((residue_constants.atom_type_num, 3))) + templates_all_atom_masks.append( + np.zeros(residue_constants.atom_type_num)) + output_templates_sequence.append('-') + + for k, v in mapping.items(): + template_index = v + mapping_offset + templates_all_atom_positions[k] = all_atom_positions[template_index][0] + templates_all_atom_masks[k] = all_atom_masks[template_index][0] + output_templates_sequence[k] = template_sequence[v] + + # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O). + if np.sum(templates_all_atom_masks) < 5: + raise TemplateAtomMaskAllZerosError( + 'Template all atom mask was all zeros: %s_%s. 
Residue range: %d-%d' + % ( + pdb_id, + chain_id, + min(mapping.values()) + mapping_offset, + max(mapping.values()) + mapping_offset, + )) + + output_templates_sequence = ''.join(output_templates_sequence) + + templates_aatype = residue_constants.sequence_to_onehot( + output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID) + + return ( + { + 'template_all_atom_positions': + np.array(templates_all_atom_positions), + 'template_all_atom_mask': np.array(templates_all_atom_masks), + 'template_sequence': output_templates_sequence.encode(), + 'template_aatype': np.array(templates_aatype), + 'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(), + }, + warning, + ) + + +def _build_query_to_hit_index_mapping( + hit_query_sequence: str, + hit_sequence: str, + indices_hit: Sequence[int], + indices_query: Sequence[int], + original_query_sequence: str, +) -> Mapping[int, int]: + """Gets mapping from indices in original query sequence to indices in the hit. + + hit_query_sequence and hit_sequence are two aligned sequences containing gap + characters. hit_query_sequence contains only the part of the original query + sequence that matched the hit. When interpreting the indices from the .hhr, we + need to correct for this to recover a mapping from original query sequence to + the hit sequence. + + Args: + hit_query_sequence: The portion of the query sequence that is in the .hhr + hit + hit_sequence: The portion of the hit sequence that is in the .hhr + indices_hit: The indices for each aminoacid relative to the hit sequence + indices_query: The indices for each aminoacid relative to the original query + sequence + original_query_sequence: String describing the original query sequence. + + Returns: + Dictionary with indices in the original query sequence as keys and indices + in the hit sequence as values. + """ + # If the hit is empty (no aligned residues), return empty mapping + if not hit_query_sequence: + return {} + + # Remove gaps and find the offset of hit.query relative to original query. + hhsearch_query_sequence = hit_query_sequence.replace('-', '') + hit_sequence = hit_sequence.replace('-', '') + hhsearch_query_offset = original_query_sequence.find( + hhsearch_query_sequence) + + # Index of -1 used for gap characters. Subtract the min index ignoring gaps. + min_idx = min(x for x in indices_hit if x > -1) + fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit] + + min_idx = min(x for x in indices_query if x > -1) + fixed_indices_query = [ + x - min_idx if x > -1 else -1 for x in indices_query + ] + + # Zip the corrected indices, ignore case where both seqs have gap characters. 
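+    # Added worked example (assumed inputs): with
+    # original_query_sequence='ABCDE', hit_query_sequence='BCD',
+    # hit_sequence='BXD', indices_query=[1, 2, 3] and indices_hit=[0, 1, 2],
+    # the offset is 1 and the loop below yields {1: 0, 2: 1, 3: 2}.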
+ mapping = {} + for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit): + if q_t != -1 and q_i != -1: + if q_t >= len(hit_sequence) or q_i + hhsearch_query_offset >= len( + original_query_sequence): + continue + mapping[q_i + hhsearch_query_offset] = q_t + + return mapping + + +@dataclasses.dataclass(frozen=True) +class SingleHitResult: + features: Optional[Mapping[str, Any]] + error: Optional[str] + warning: Optional[str] + + +@functools.lru_cache(16, typed=False) +def _read_file(path): + with open(path, 'r') as f: + file_data = f.read() + return file_data + + +def _process_single_hit( + query_sequence: str, + hit: parsers.TemplateHit, + mmcif_dir: str, + max_template_date: datetime.datetime, + release_dates: Mapping[str, datetime.datetime], + obsolete_pdbs: Mapping[str, Optional[str]], + kalign_binary_path: str, + strict_error_check: bool = False, +) -> SingleHitResult: + """Tries to extract template features from a single HHSearch hit.""" + # Fail hard if we can't get the PDB ID and chain name from the hit. + hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit) + + # This hit has been removed (obsoleted) from PDB, skip it. + if hit_pdb_code in obsolete_pdbs and obsolete_pdbs[hit_pdb_code] is None: + return SingleHitResult( + features=None, + error=None, + warning=f'Hit {hit_pdb_code} is obsolete.') + + if hit_pdb_code not in release_dates: + if hit_pdb_code in obsolete_pdbs: + hit_pdb_code = obsolete_pdbs[hit_pdb_code] + + # Pass hit_pdb_code since it might have changed due to the pdb being obsolete. + try: + _assess_hhsearch_hit( + hit=hit, + hit_pdb_code=hit_pdb_code, + query_sequence=query_sequence, + release_dates=release_dates, + release_date_cutoff=max_template_date, + ) + except PrefilterError as e: + msg = f'hit {hit_pdb_code}_{hit_chain_id} did not pass prefilter: {str(e)}' + logging.info(msg) + if strict_error_check and isinstance(e, (DateError, DuplicateError)): + # In strict mode we treat some prefilter cases as errors. + return SingleHitResult(features=None, error=msg, warning=None) + + return SingleHitResult(features=None, error=None, warning=None) + + mapping = _build_query_to_hit_index_mapping(hit.query, hit.hit_sequence, + hit.indices_hit, + hit.indices_query, + query_sequence) + + # The mapping is from the query to the actual hit sequence, so we need to + # remove gaps (which regardless have a missing confidence score). + template_sequence = hit.hit_sequence.replace('-', '') + + cif_path = os.path.join(mmcif_dir, hit_pdb_code + '.cif') + logging.debug( + 'Reading PDB entry from %s. Query: %s, template: %s', + cif_path, + query_sequence, + template_sequence, + ) + # Fail if we can't find the mmCIF file. + cif_string = _read_file(cif_path) + + parsing_result = mmcif.parse(file_id=hit_pdb_code, mmcif_string=cif_string) + + if parsing_result.mmcif_object is not None: + hit_release_date = datetime.datetime.strptime( + parsing_result.mmcif_object.header['release_date'], '%Y-%m-%d') + if hit_release_date > max_template_date: + error = 'Template %s date (%s) > max template date (%s).' 
% ( + hit_pdb_code, + hit_release_date, + max_template_date, + ) + if strict_error_check: + return SingleHitResult( + features=None, error=error, warning=None) + else: + logging.debug(error) + return SingleHitResult(features=None, error=None, warning=None) + + try: + features, realign_warning = _extract_template_features( + mmcif_object=parsing_result.mmcif_object, + pdb_id=hit_pdb_code, + mapping=mapping, + template_sequence=template_sequence, + query_sequence=query_sequence, + template_chain_id=hit_chain_id, + kalign_binary_path=kalign_binary_path, + ) + if hit.sum_probs is None: + features['template_sum_probs'] = [0] + else: + features['template_sum_probs'] = [hit.sum_probs] + + # It is possible there were some errors when parsing the other chains in the + # mmCIF file, but the template features for the chain we want were still + # computed. In such case the mmCIF parsing errors are not relevant. + return SingleHitResult( + features=features, error=None, warning=realign_warning) + except ( + NoChainsError, + NoAtomDataInTemplateError, + TemplateAtomMaskAllZerosError, + ) as e: + # These 3 errors indicate missing mmCIF experimental data rather than a + # problem with the template search, so turn them into warnings. + warning = ( + '%s_%s (sum_probs: %s, rank: %s): feature extracting errors: ' + '%s, mmCIF parsing errors: %s' % ( + hit_pdb_code, + hit_chain_id, + hit.sum_probs, + hit.index, + str(e), + parsing_result.errors, + )) + if strict_error_check: + return SingleHitResult(features=None, error=warning, warning=None) + else: + return SingleHitResult(features=None, error=None, warning=warning) + except Error as e: + error = ( + '%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: ' + '%s, mmCIF parsing errors: %s' % ( + hit_pdb_code, + hit_chain_id, + hit.sum_probs, + hit.index, + str(e), + parsing_result.errors, + )) + return SingleHitResult(features=None, error=error, warning=None) + + +@dataclasses.dataclass(frozen=True) +class TemplateSearchResult: + features: Mapping[str, Any] + errors: Sequence[str] + warnings: Sequence[str] + + +class TemplateHitFeaturizer(abc.ABC): + """An abstract base class for turning template hits to template features.""" + + def __init__( + self, + mmcif_dir: str, + max_template_date: str, + max_hits: int, + kalign_binary_path: str, + release_dates_path: Optional[str], + obsolete_pdbs_path: Optional[str], + strict_error_check: bool = False, + ): + """Initializes the Template Search. + + Args: + mmcif_dir: Path to a directory with mmCIF structures. Once a template ID + is found by HHSearch, this directory is used to retrieve the template + data. + max_template_date: The maximum date permitted for template structures. No + template with date higher than this date will be returned. In ISO8601 + date format, YYYY-MM-DD. + max_hits: The maximum number of templates that will be returned. + kalign_binary_path: The path to a kalign executable used for template + realignment. + release_dates_path: An optional path to a file with a mapping from PDB IDs + to their release dates. Thanks to this we don't have to redundantly + parse mmCIF files to get that information. + obsolete_pdbs_path: An optional path to a file containing a mapping from + obsolete PDB IDs to the PDB IDs of their replacements. + strict_error_check: If True, then the following will be treated as errors: + * If any template date is after the max_template_date. + * If any template has identical PDB ID to the query. + * If any template is a duplicate of the query. 
+ * Any feature computation errors. + """ + self._mmcif_dir = mmcif_dir + if not glob.glob(os.path.join(self._mmcif_dir, '*.cif')): + logging.error('Could not find CIFs in %s', self._mmcif_dir) + raise ValueError(f'Could not find CIFs in {self._mmcif_dir}') + + try: + self._max_template_date = datetime.datetime.strptime( + max_template_date, '%Y-%m-%d') + except ValueError: + raise ValueError( + 'max_template_date must be set and have format YYYY-MM-DD.') + self._max_hits = max_hits + self._kalign_binary_path = kalign_binary_path + self._strict_error_check = strict_error_check + + if release_dates_path: + logging.info('Using precomputed release dates %s.', + release_dates_path) + self._release_dates = _parse_release_dates(release_dates_path) + else: + self._release_dates = {} + + if obsolete_pdbs_path: + logging.info('Using precomputed obsolete pdbs %s.', + obsolete_pdbs_path) + self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path) + else: + self._obsolete_pdbs = {} + + @abc.abstractmethod + def get_templates( + self, query_sequence: str, + hits: Sequence[parsers.TemplateHit]) -> TemplateSearchResult: + """Computes the templates for given query sequence.""" + + +class HhsearchHitFeaturizer(TemplateHitFeaturizer): + """A class for turning a3m hits from hhsearch to template features.""" + + def get_templates( + self, query_sequence: str, + hits: Sequence[parsers.TemplateHit]) -> TemplateSearchResult: + """Computes the templates for given query sequence (more details above).""" + logging.info('Searching for template for: %s', query_sequence) + + template_features = {} + for template_feature_name in TEMPLATE_FEATURES: + template_features[template_feature_name] = [] + + num_hits = 0 + errors = [] + warnings = [] + + for hit in sorted(hits, key=lambda x: x.sum_probs, reverse=True): + # We got all the templates we wanted, stop processing hits. + if num_hits >= self._max_hits: + break + + result = _process_single_hit( + query_sequence=query_sequence, + hit=hit, + mmcif_dir=self._mmcif_dir, + max_template_date=self._max_template_date, + release_dates=self._release_dates, + obsolete_pdbs=self._obsolete_pdbs, + strict_error_check=self._strict_error_check, + kalign_binary_path=self._kalign_binary_path, + ) + + if result.error: + errors.append(result.error) + + # There could be an error even if there are some results, e.g. thrown by + # other unparsable chains in the same mmCIF file. + if result.warning: + warnings.append(result.warning) + + if result.features is None: + logging.info( + 'Skipped invalid hit %s, error: %s, warning: %s', + hit.name, + result.error, + result.warning, + ) + else: + # Increment the hit counter, since we got features out of this hit. + num_hits += 1 + for k in template_features: + template_features[k].append(result.features[k]) + + for name in template_features: + if num_hits > 0: + template_features[name] = np.stack( + template_features[name], + axis=0).astype(TEMPLATE_FEATURES[name]) + else: + # Make sure the feature has correct dtype even if empty. 
+ template_features[name] = np.array( + [], dtype=TEMPLATE_FEATURES[name]) + + return TemplateSearchResult( + features=template_features, errors=errors, warnings=warnings) + + +class HmmsearchHitFeaturizer(TemplateHitFeaturizer): + """A class for turning a3m hits from hmmsearch to template features.""" + + def get_templates( + self, query_sequence: str, + hits: Sequence[parsers.TemplateHit]) -> TemplateSearchResult: + """Computes the templates for given query sequence (more details above).""" + logging.info('Searching for template for: %s', query_sequence) + + template_features = {} + for template_feature_name in TEMPLATE_FEATURES: + template_features[template_feature_name] = [] + + already_seen = set() + errors = [] + warnings = [] + + if not hits or hits[0].sum_probs is None: + sorted_hits = hits + else: + sorted_hits = sorted(hits, key=lambda x: x.sum_probs, reverse=True) + + for hit in sorted_hits: + # We got all the templates we wanted, stop processing hits. + if len(already_seen) >= self._max_hits: + break + + result = _process_single_hit( + query_sequence=query_sequence, + hit=hit, + mmcif_dir=self._mmcif_dir, + max_template_date=self._max_template_date, + release_dates=self._release_dates, + obsolete_pdbs=self._obsolete_pdbs, + strict_error_check=self._strict_error_check, + kalign_binary_path=self._kalign_binary_path, + ) + + if result.error: + errors.append(result.error) + + # There could be an error even if there are some results, e.g. thrown by + # other unparsable chains in the same mmCIF file. + if result.warning: + warnings.append(result.warning) + + if result.features is None: + logging.debug( + 'Skipped invalid hit %s, error: %s, warning: %s', + hit.name, + result.error, + result.warning, + ) + else: + already_seen_key = result.features['template_sequence'] + if already_seen_key in already_seen: + continue + # Increment the hit counter, since we got features out of this hit. + already_seen.add(already_seen_key) + for k in template_features: + template_features[k].append(result.features[k]) + + if already_seen: + for name in template_features: + template_features[name] = np.stack( + template_features[name], + axis=0).astype(TEMPLATE_FEATURES[name]) + else: + num_res = len(query_sequence) + # Construct a default template with all zeros. + template_features = { + 'template_aatype': + np.zeros( + (1, num_res, len( + residue_constants.restypes_with_x_and_gap)), + np.float32, + ), + 'template_all_atom_mask': + np.zeros((1, num_res, residue_constants.atom_type_num), + np.float32), + 'template_all_atom_positions': + np.zeros((1, num_res, residue_constants.atom_type_num, 3), + np.float32), + 'template_domain_names': + np.array([''.encode()], dtype=np.object), + 'template_sequence': + np.array([''.encode()], dtype=np.object), + 'template_sum_probs': + np.array([0], dtype=np.float32), + } + return TemplateSearchResult( + features=template_features, errors=errors, warnings=warnings) diff --git a/modelscope/models/science/unifold/msa/tools/__init__.py b/modelscope/models/science/unifold/msa/tools/__init__.py new file mode 100644 index 00000000..903d0979 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
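For orientation, the two featurizer classes above are typically driven as in the sketch below. This is a hypothetical usage example, not part of the patch: the mmCIF directory, the `.hhr` file and the query sequence are placeholders, and the constructor arguments simply mirror the `HhsearchHitFeaturizer` call that appears later in this patch inside `uni_fold.py`.

```python
# Hypothetical usage sketch; paths and the query sequence are placeholders.
from modelscope.models.science.unifold.msa import parsers, templates

featurizer = templates.HhsearchHitFeaturizer(
    mmcif_dir='/path/to/pdb_mmcif',     # directory containing *.cif files
    max_template_date='2022-01-01',     # ISO8601 cutoff, YYYY-MM-DD
    max_hits=20,
    kalign_binary_path='kalign',
    release_dates_path=None,
    obsolete_pdbs_path=None,
)

with open('/path/to/query.hhr') as f:   # output of a prior HHSearch run
    hits = parsers.parse_hhr(f.read())

result = featurizer.get_templates(
    query_sequence='MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ', hits=hits)
print(result.features['template_domain_names'])
print(result.errors, result.warnings)
```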
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Python wrappers for third party tools.""" diff --git a/modelscope/models/science/unifold/msa/tools/hhblits.py b/modelscope/models/science/unifold/msa/tools/hhblits.py new file mode 100644 index 00000000..ee442e39 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/hhblits.py @@ -0,0 +1,170 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library to run HHblits from Python.""" + +import glob +import os +import subprocess +from typing import Any, List, Mapping, Optional, Sequence + +from absl import logging + +from . import utils + +_HHBLITS_DEFAULT_P = 20 +_HHBLITS_DEFAULT_Z = 500 + + +class HHBlits: + """Python wrapper of the HHblits binary.""" + + def __init__( + self, + *, + binary_path: str, + databases: Sequence[str], + n_cpu: int = 4, + n_iter: int = 3, + e_value: float = 0.001, + maxseq: int = 1_000_000, + realign_max: int = 100_000, + maxfilt: int = 100_000, + min_prefilter_hits: int = 1000, + all_seqs: bool = False, + alt: Optional[int] = None, + p: int = _HHBLITS_DEFAULT_P, + z: int = _HHBLITS_DEFAULT_Z, + ): + """Initializes the Python HHblits wrapper. + + Args: + binary_path: The path to the HHblits executable. + databases: A sequence of HHblits database paths. This should be the + common prefix for the database files (i.e. up to but not including + _hhm.ffindex etc.) + n_cpu: The number of CPUs to give HHblits. + n_iter: The number of HHblits iterations. + e_value: The E-value, see HHblits docs for more details. + maxseq: The maximum number of rows in an input alignment. Note that this + parameter is only supported in HHBlits version 3.1 and higher. + realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500. + maxfilt: Max number of hits allowed to pass the 2nd prefilter. + HHblits default: 20000. + min_prefilter_hits: Min number of hits to pass prefilter. + HHblits default: 100. + all_seqs: Return all sequences in the MSA / Do not filter the result MSA. + HHblits default: False. + alt: Show up to this many alternative alignments. + p: Minimum Prob for a hit to be included in the output hhr file. + HHblits default: 20. + z: Hard cap on number of hits reported in the hhr file. + HHblits default: 500. NB: The relevant HHblits flag is -Z not -z. + + Raises: + RuntimeError: If HHblits binary not found within the path. 
+ """ + self.binary_path = binary_path + self.databases = databases + + for database_path in self.databases: + if not glob.glob(database_path + '_*'): + logging.error('Could not find HHBlits database %s', + database_path) + raise ValueError( + f'Could not find HHBlits database {database_path}') + + self.n_cpu = n_cpu + self.n_iter = n_iter + self.e_value = e_value + self.maxseq = maxseq + self.realign_max = realign_max + self.maxfilt = maxfilt + self.min_prefilter_hits = min_prefilter_hits + self.all_seqs = all_seqs + self.alt = alt + self.p = p + self.z = z + + def query(self, input_fasta_path: str) -> List[Mapping[str, Any]]: + """Queries the database using HHblits.""" + with utils.tmpdir_manager() as query_tmp_dir: + a3m_path = os.path.join(query_tmp_dir, 'output.a3m') + + db_cmd = [] + for db_path in self.databases: + db_cmd.append('-d') + db_cmd.append(db_path) + cmd = [ + self.binary_path, + '-i', + input_fasta_path, + '-cpu', + str(self.n_cpu), + '-oa3m', + a3m_path, + '-o', + '/dev/null', + '-n', + str(self.n_iter), + '-e', + str(self.e_value), + '-maxseq', + str(self.maxseq), + '-realign_max', + str(self.realign_max), + '-maxfilt', + str(self.maxfilt), + '-min_prefilter_hits', + str(self.min_prefilter_hits), + ] + if self.all_seqs: + cmd += ['-all'] + if self.alt: + cmd += ['-alt', str(self.alt)] + if self.p != _HHBLITS_DEFAULT_P: + cmd += ['-p', str(self.p)] + if self.z != _HHBLITS_DEFAULT_Z: + cmd += ['-Z', str(self.z)] + cmd += db_cmd + + logging.info('Launching subprocess "%s"', ' '.join(cmd)) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + with utils.timing('HHblits query'): + stdout, stderr = process.communicate() + retcode = process.wait() + + if retcode: + # Logs have a 15k character limit, so log HHblits error line by line. + logging.error('HHblits failed. HHblits stderr begin:') + for error_line in stderr.decode('utf-8').splitlines(): + if error_line.strip(): + logging.error(error_line.strip()) + logging.error('HHblits stderr end') + raise RuntimeError( + 'HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % + (stdout.decode('utf-8'), stderr[:500_000].decode('utf-8'))) + + with open(a3m_path) as f: + a3m = f.read() + + raw_output = dict( + a3m=a3m, + output=stdout, + stderr=stderr, + n_iter=self.n_iter, + e_value=self.e_value, + ) + return [raw_output] diff --git a/modelscope/models/science/unifold/msa/tools/hhsearch.py b/modelscope/models/science/unifold/msa/tools/hhsearch.py new file mode 100644 index 00000000..ac7f3b55 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/hhsearch.py @@ -0,0 +1,111 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library to run HHsearch from Python.""" + +import glob +import os +import subprocess +from typing import Sequence + +from absl import logging + +from modelscope.models.science.unifold.msa import parsers +from . 
import utils + + +class HHSearch: + """Python wrapper of the HHsearch binary.""" + + def __init__(self, + *, + binary_path: str, + databases: Sequence[str], + maxseq: int = 1_000_000): + """Initializes the Python HHsearch wrapper. + + Args: + binary_path: The path to the HHsearch executable. + databases: A sequence of HHsearch database paths. This should be the + common prefix for the database files (i.e. up to but not including + _hhm.ffindex etc.) + maxseq: The maximum number of rows in an input alignment. Note that this + parameter is only supported in HHBlits version 3.1 and higher. + + Raises: + RuntimeError: If HHsearch binary not found within the path. + """ + self.binary_path = binary_path + self.databases = databases + self.maxseq = maxseq + + for database_path in self.databases: + if not glob.glob(database_path + '_*'): + logging.error('Could not find HHsearch database %s', + database_path) + raise ValueError( + f'Could not find HHsearch database {database_path}') + + @property + def output_format(self) -> str: + return 'hhr' + + @property + def input_format(self) -> str: + return 'a3m' + + def query(self, a3m: str) -> str: + """Queries the database using HHsearch using a given a3m.""" + with utils.tmpdir_manager() as query_tmp_dir: + input_path = os.path.join(query_tmp_dir, 'query.a3m') + hhr_path = os.path.join(query_tmp_dir, 'output.hhr') + with open(input_path, 'w') as f: + f.write(a3m) + + db_cmd = [] + for db_path in self.databases: + db_cmd.append('-d') + db_cmd.append(db_path) + cmd = [ + self.binary_path, + '-i', + input_path, + '-o', + hhr_path, + '-maxseq', + str(self.maxseq), + ] + db_cmd + + logging.info('Launching subprocess "%s"', ' '.join(cmd)) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + with utils.timing('HHsearch query'): + stdout, stderr = process.communicate() + retcode = process.wait() + + if retcode: + # Stderr is truncated to prevent proto size errors in Beam. + raise RuntimeError( + 'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % + (stdout.decode('utf-8'), stderr[:100_000].decode('utf-8'))) + + with open(hhr_path) as f: + hhr = f.read() + return hhr + + def get_template_hits( + self, output_string: str, + input_sequence: str) -> Sequence[parsers.TemplateHit]: + """Gets parsed template hits from the raw string output by the tool.""" + del input_sequence # Used by hmmseach but not needed for hhsearch. + return parsers.parse_hhr(output_string) diff --git a/modelscope/models/science/unifold/msa/tools/hmmbuild.py b/modelscope/models/science/unifold/msa/tools/hmmbuild.py new file mode 100644 index 00000000..84f205d6 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/hmmbuild.py @@ -0,0 +1,143 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A Python wrapper for hmmbuild - construct HMM profiles from MSA.""" + +import os +import re +import subprocess + +from absl import logging + +from . 
import utils + + +class Hmmbuild(object): + """Python wrapper of the hmmbuild binary.""" + + def __init__(self, *, binary_path: str, singlemx: bool = False): + """Initializes the Python hmmbuild wrapper. + + Args: + binary_path: The path to the hmmbuild executable. + singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to + just use a common substitution score matrix. + + Raises: + RuntimeError: If hmmbuild binary not found within the path. + """ + self.binary_path = binary_path + self.singlemx = singlemx + + def build_profile_from_sto(self, + sto: str, + model_construction='fast') -> str: + """Builds a HHM for the aligned sequences given as an A3M string. + + Args: + sto: A string with the aligned sequences in the Stockholm format. + model_construction: Whether to use reference annotation in the msa to + determine consensus columns ('hand') or default ('fast'). + + Returns: + A string with the profile in the HMM format. + + Raises: + RuntimeError: If hmmbuild fails. + """ + return self._build_profile(sto, model_construction=model_construction) + + def build_profile_from_a3m(self, a3m: str) -> str: + """Builds a HHM for the aligned sequences given as an A3M string. + + Args: + a3m: A string with the aligned sequences in the A3M format. + + Returns: + A string with the profile in the HMM format. + + Raises: + RuntimeError: If hmmbuild fails. + """ + lines = [] + for line in a3m.splitlines(): + if not line.startswith('>'): + line = re.sub('[a-z]+', '', line) # Remove inserted residues. + lines.append(line + '\n') + msa = ''.join(lines) + return self._build_profile(msa, model_construction='fast') + + def _build_profile(self, + msa: str, + model_construction: str = 'fast') -> str: + """Builds a HMM for the aligned sequences given as an MSA string. + + Args: + msa: A string with the aligned sequences, in A3M or STO format. + model_construction: Whether to use reference annotation in the msa to + determine consensus columns ('hand') or default ('fast'). + + Returns: + A string with the profile in the HMM format. + + Raises: + RuntimeError: If hmmbuild fails. + ValueError: If unspecified arguments are provided. 
+ """ + if model_construction not in {'hand', 'fast'}: + raise ValueError( + f'Invalid model_construction {model_construction} - only' + 'hand and fast supported.') + + with utils.tmpdir_manager() as query_tmp_dir: + input_query = os.path.join(query_tmp_dir, 'query.msa') + output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm') + + with open(input_query, 'w') as f: + f.write(msa) + + cmd = [self.binary_path] + # If adding flags, we have to do so before the output and input: + + if model_construction == 'hand': + cmd.append(f'--{model_construction}') + if self.singlemx: + cmd.append('--singlemx') + cmd.extend([ + '--amino', + output_hmm_path, + input_query, + ]) + + logging.info('Launching subprocess %s', cmd) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + with utils.timing('hmmbuild query'): + stdout, stderr = process.communicate() + retcode = process.wait() + logging.info( + 'hmmbuild stdout:\n%s\n\nstderr:\n%s\n', + stdout.decode('utf-8'), + stderr.decode('utf-8'), + ) + + if retcode: + raise RuntimeError( + 'hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n' % + (stdout.decode('utf-8'), stderr.decode('utf-8'))) + + with open(output_hmm_path, encoding='utf-8') as f: + hmm = f.read() + + return hmm diff --git a/modelscope/models/science/unifold/msa/tools/hmmsearch.py b/modelscope/models/science/unifold/msa/tools/hmmsearch.py new file mode 100644 index 00000000..445970ca --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/hmmsearch.py @@ -0,0 +1,146 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A Python wrapper for hmmsearch - search profile against a sequence db.""" + +import os +import subprocess +from typing import Optional, Sequence + +from absl import logging + +from modelscope.models.science.unifold.msa import parsers +from . import hmmbuild, utils + + +class Hmmsearch(object): + """Python wrapper of the hmmsearch binary.""" + + def __init__( + self, + *, + binary_path: str, + hmmbuild_binary_path: str, + database_path: str, + flags: Optional[Sequence[str]] = None, + ): + """Initializes the Python hmmsearch wrapper. + + Args: + binary_path: The path to the hmmsearch executable. + hmmbuild_binary_path: The path to the hmmbuild executable. Used to build + an hmm from an input a3m. + database_path: The path to the hmmsearch database (FASTA format). + flags: List of flags to be used by hmmsearch. + + Raises: + RuntimeError: If hmmsearch binary not found within the path. + """ + self.binary_path = binary_path + self.hmmbuild_runner = hmmbuild.Hmmbuild( + binary_path=hmmbuild_binary_path) + self.database_path = database_path + if flags is None: + # Default hmmsearch run settings. 
+ flags = [ + '--F1', + '0.1', + '--F2', + '0.1', + '--F3', + '0.1', + '--incE', + '100', + '-E', + '100', + '--domE', + '100', + '--incdomE', + '100', + ] + self.flags = flags + + if not os.path.exists(self.database_path): + logging.error('Could not find hmmsearch database %s', + database_path) + raise ValueError( + f'Could not find hmmsearch database {database_path}') + + @property + def output_format(self) -> str: + return 'sto' + + @property + def input_format(self) -> str: + return 'sto' + + def query(self, msa_sto: str) -> str: + """Queries the database using hmmsearch using a given stockholm msa.""" + hmm = self.hmmbuild_runner.build_profile_from_sto( + msa_sto, model_construction='hand') + return self.query_with_hmm(hmm) + + def query_with_hmm(self, hmm: str) -> str: + """Queries the database using hmmsearch using a given hmm.""" + with utils.tmpdir_manager() as query_tmp_dir: + hmm_input_path = os.path.join(query_tmp_dir, 'query.hmm') + out_path = os.path.join(query_tmp_dir, 'output.sto') + with open(hmm_input_path, 'w') as f: + f.write(hmm) + + cmd = [ + self.binary_path, + '--noali', # Don't include the alignment in stdout. + '--cpu', + '8', + ] + # If adding flags, we have to do so before the output and input: + if self.flags: + cmd.extend(self.flags) + cmd.extend([ + '-A', + out_path, + hmm_input_path, + self.database_path, + ]) + + logging.info('Launching sub-process %s', cmd) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + with utils.timing( + f'hmmsearch ({os.path.basename(self.database_path)}) query' + ): + stdout, stderr = process.communicate() + retcode = process.wait() + + if retcode: + raise RuntimeError( + 'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % + (stdout.decode('utf-8'), stderr.decode('utf-8'))) + + with open(out_path) as f: + out_msa = f.read() + + return out_msa + + def get_template_hits( + self, output_string: str, + input_sequence: str) -> Sequence[parsers.TemplateHit]: + """Gets parsed template hits from the raw string output by the tool.""" + a3m_string = parsers.convert_stockholm_to_a3m( + output_string, remove_first_row_gaps=False) + template_hits = parsers.parse_hmmsearch_a3m( + query_sequence=input_sequence, + a3m_string=a3m_string, + skip_first=False) + return template_hits diff --git a/modelscope/models/science/unifold/msa/tools/jackhmmer.py b/modelscope/models/science/unifold/msa/tools/jackhmmer.py new file mode 100644 index 00000000..3e29eec9 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/jackhmmer.py @@ -0,0 +1,224 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Library to run Jackhmmer from Python.""" + +import glob +import os +import subprocess +from concurrent import futures +from typing import Any, Callable, Mapping, Optional, Sequence +from urllib import request + +from absl import logging + +from . 
import utils + + +class Jackhmmer: + """Python wrapper of the Jackhmmer binary.""" + + def __init__( + self, + *, + binary_path: str, + database_path: str, + n_cpu: int = 8, + n_iter: int = 1, + e_value: float = 0.0001, + z_value: Optional[int] = None, + get_tblout: bool = False, + filter_f1: float = 0.0005, + filter_f2: float = 0.00005, + filter_f3: float = 0.0000005, + incdom_e: Optional[float] = None, + dom_e: Optional[float] = None, + num_streamed_chunks: Optional[int] = None, + streaming_callback: Optional[Callable[[int], None]] = None, + ): + """Initializes the Python Jackhmmer wrapper. + + Args: + binary_path: The path to the jackhmmer executable. + database_path: The path to the jackhmmer database (FASTA format). + n_cpu: The number of CPUs to give Jackhmmer. + n_iter: The number of Jackhmmer iterations. + e_value: The E-value, see Jackhmmer docs for more details. + z_value: The Z-value, see Jackhmmer docs for more details. + get_tblout: Whether to save tblout string. + filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off. + filter_f2: Viterbi pre-filter, set to >1.0 to turn off. + filter_f3: Forward pre-filter, set to >1.0 to turn off. + incdom_e: Domain e-value criteria for inclusion of domains in MSA/next + round. + dom_e: Domain e-value criteria for inclusion in tblout. + num_streamed_chunks: Number of database chunks to stream over. + streaming_callback: Callback function run after each chunk iteration with + the iteration number as argument. + """ + self.binary_path = binary_path + self.database_path = database_path + self.num_streamed_chunks = num_streamed_chunks + + if not os.path.exists( + self.database_path) and num_streamed_chunks is None: + logging.error('Could not find Jackhmmer database %s', + database_path) + raise ValueError( + f'Could not find Jackhmmer database {database_path}') + + self.n_cpu = n_cpu + self.n_iter = n_iter + self.e_value = e_value + self.z_value = z_value + self.filter_f1 = filter_f1 + self.filter_f2 = filter_f2 + self.filter_f3 = filter_f3 + self.incdom_e = incdom_e + self.dom_e = dom_e + self.get_tblout = get_tblout + self.streaming_callback = streaming_callback + + def _query_chunk(self, input_fasta_path: str, + database_path: str) -> Mapping[str, Any]: + """Queries the database chunk using Jackhmmer.""" + with utils.tmpdir_manager() as query_tmp_dir: + sto_path = os.path.join(query_tmp_dir, 'output.sto') + + # The F1/F2/F3 are the expected proportion to pass each of the filtering + # stages (which get progressively more expensive), reducing these + # speeds up the pipeline at the expensive of sensitivity. They are + # currently set very low to make querying Mgnify run in a reasonable + # amount of time. + cmd_flags = [ + # Don't pollute stdout with Jackhmmer output. + '-o', + '/dev/null', + '-A', + sto_path, + '--noali', + '--F1', + str(self.filter_f1), + '--F2', + str(self.filter_f2), + '--F3', + str(self.filter_f3), + '--incE', + str(self.e_value), + # Report only sequences with E-values <= x in per-sequence output. 
+ '-E', + str(self.e_value), + '--cpu', + str(self.n_cpu), + '-N', + str(self.n_iter), + ] + if self.get_tblout: + tblout_path = os.path.join(query_tmp_dir, 'tblout.txt') + cmd_flags.extend(['--tblout', tblout_path]) + + if self.z_value: + cmd_flags.extend(['-Z', str(self.z_value)]) + + if self.dom_e is not None: + cmd_flags.extend(['--domE', str(self.dom_e)]) + + if self.incdom_e is not None: + cmd_flags.extend(['--incdomE', str(self.incdom_e)]) + + cmd = [self.binary_path + ] + cmd_flags + [input_fasta_path, database_path] + + logging.info('Launching subprocess "%s"', ' '.join(cmd)) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + with utils.timing( + f'Jackhmmer ({os.path.basename(database_path)}) query'): + _, stderr = process.communicate() + retcode = process.wait() + + if retcode: + raise RuntimeError('Jackhmmer failed\nstderr:\n%s\n' + % stderr.decode('utf-8')) + + # Get e-values for each target name + tbl = '' + if self.get_tblout: + with open(tblout_path) as f: + tbl = f.read() + + with open(sto_path) as f: + sto = f.read() + + raw_output = dict( + sto=sto, + tbl=tbl, + stderr=stderr, + n_iter=self.n_iter, + e_value=self.e_value) + + return raw_output + + def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]: + """Queries the database using Jackhmmer.""" + if self.num_streamed_chunks is None: + return [self._query_chunk(input_fasta_path, self.database_path)] + + db_basename = os.path.basename(self.database_path) + + def db_remote_chunk(db_idx): + return f'{self.database_path}.{db_idx}' + + def db_local_chunk(db_idx): + return f'/tmp/ramdisk/{db_basename}.{db_idx}' + + # db_remote_chunk = lambda db_idx: f'{self.database_path}.{db_idx}' + # db_local_chunk = lambda db_idx: f'/tmp/ramdisk/{db_basename}.{db_idx}' + + # Remove existing files to prevent OOM + for f in glob.glob(db_local_chunk('[0-9]*')): + try: + os.remove(f) + except OSError: + print(f'OSError while deleting {f}') + + # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk + with futures.ThreadPoolExecutor(max_workers=2) as executor: + chunked_output = [] + for i in range(1, self.num_streamed_chunks + 1): + # Copy the chunk locally + if i == 1: + future = executor.submit(request.urlretrieve, + db_remote_chunk(i), + db_local_chunk(i)) + if i < self.num_streamed_chunks: + next_future = executor.submit( + request.urlretrieve, + db_remote_chunk(i + 1), + db_local_chunk(i + 1), + ) + + # Run Jackhmmer with the chunk + future.result() + chunked_output.append( + self._query_chunk(input_fasta_path, db_local_chunk(i))) + + # Remove the local copy of the chunk + os.remove(db_local_chunk(i)) + # Do not set next_future for the last chunk so that this works even for + # databases with only 1 chunk. + if i < self.num_streamed_chunks: + future = next_future + if self.streaming_callback: + self.streaming_callback(i) + return chunked_output diff --git a/modelscope/models/science/unifold/msa/tools/kalign.py b/modelscope/models/science/unifold/msa/tools/kalign.py new file mode 100644 index 00000000..1ea997fa --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/kalign.py @@ -0,0 +1,110 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
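The Jackhmmer wrapper above can also be exercised on its own. The sketch below is hypothetical: the binary and database paths are placeholders for a local jackhmmer executable and a FASTA database, and only the constructor arguments and the shape of the `query` return value are taken from the code in this patch.

```python
# Hypothetical usage sketch; binary and database paths are placeholders.
from modelscope.models.science.unifold.msa.tools import jackhmmer

runner = jackhmmer.Jackhmmer(
    binary_path='/usr/bin/jackhmmer',
    database_path='/data/uniref90/uniref90.fasta',
    n_cpu=8,
    n_iter=1,
    e_value=0.0001,
)
# With num_streamed_chunks left unset, query() returns a single-element list.
results = runner.query('/tmp/query.fasta')
sto_msa = results[0]['sto']   # Stockholm-format alignment written via -A
print(sto_msa[:200])
```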
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A Python wrapper for Kalign.""" +import os +import subprocess +from typing import Sequence + +from absl import logging + +from . import utils + + +def _to_a3m(sequences: Sequence[str]) -> str: + """Converts sequences to an a3m file.""" + names = ['sequence %d' % i for i in range(1, len(sequences) + 1)] + a3m = [] + for sequence, name in zip(sequences, names): + a3m.append('>' + name + '\n') + a3m.append(sequence + '\n') + return ''.join(a3m) + + +class Kalign: + """Python wrapper of the Kalign binary.""" + + def __init__(self, *, binary_path: str): + """Initializes the Python Kalign wrapper. + + Args: + binary_path: The path to the Kalign binary. + + Raises: + RuntimeError: If Kalign binary not found within the path. + """ + self.binary_path = binary_path + + def align(self, sequences: Sequence[str]) -> str: + """Aligns the sequences and returns the alignment in A3M string. + + Args: + sequences: A list of query sequence strings. The sequences have to be at + least 6 residues long (Kalign requires this). Note that the order in + which you give the sequences might alter the output slightly as + different alignment tree might get constructed. + + Returns: + A string with the alignment in a3m format. + + Raises: + RuntimeError: If Kalign fails. + ValueError: If any of the sequences is less than 6 residues long. + """ + logging.info('Aligning %d sequences', len(sequences)) + + for s in sequences: + if len(s) < 6: + raise ValueError( + 'Kalign requires all sequences to be at least 6 ' + 'residues long. Got %s (%d residues).' % (s, len(s))) + + with utils.tmpdir_manager() as query_tmp_dir: + input_fasta_path = os.path.join(query_tmp_dir, 'input.fasta') + output_a3m_path = os.path.join(query_tmp_dir, 'output.a3m') + + with open(input_fasta_path, 'w') as f: + f.write(_to_a3m(sequences)) + + cmd = [ + self.binary_path, + '-i', + input_fasta_path, + '-o', + output_a3m_path, + '-format', + 'fasta', + ] + + logging.info('Launching subprocess "%s"', ' '.join(cmd)) + process = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + with utils.timing('Kalign query'): + stdout, stderr = process.communicate() + retcode = process.wait() + logging.info( + 'Kalign stdout:\n%s\n\nstderr:\n%s\n', + stdout.decode('utf-8'), + stderr.decode('utf-8'), + ) + + if retcode: + raise RuntimeError( + 'Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n' % + (stdout.decode('utf-8'), stderr.decode('utf-8'))) + + with open(output_a3m_path) as f: + a3m = f.read() + + return a3m diff --git a/modelscope/models/science/unifold/msa/tools/utils.py b/modelscope/models/science/unifold/msa/tools/utils.py new file mode 100644 index 00000000..1c2af936 --- /dev/null +++ b/modelscope/models/science/unifold/msa/tools/utils.py @@ -0,0 +1,40 @@ +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
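Similarly, the Kalign wrapper above is self-contained enough to try directly. The sketch below is hypothetical: it assumes a `kalign` binary on `PATH` (the same assumption the template featurizer makes via `kalign_binary_path='kalign'` later in this patch) and uses made-up sequences, each at least 6 residues long as `align` requires.

```python
# Hypothetical usage sketch; assumes a kalign binary on PATH.
from modelscope.models.science.unifold.msa.tools import kalign

aligner = kalign.Kalign(binary_path='kalign')
a3m = aligner.align([
    'MKTAYIAKQRQISFVK',   # every sequence must be at least 6 residues
    'MKTAYIAKQRQISFVR',
    'MKTAYIAKQAQISFVK',
])
print(a3m)
```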
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Common utilities for data pipeline tools.""" +import contextlib +import shutil +import tempfile +import time +from typing import Optional + +from absl import logging + + +@contextlib.contextmanager +def tmpdir_manager(base_dir: Optional[str] = None): + """Context manager that deletes a temporary directory on exit.""" + tmpdir = tempfile.mkdtemp(dir=base_dir) + try: + yield tmpdir + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +@contextlib.contextmanager +def timing(msg: str): + logging.info('Started %s', msg) + tic = time.time() + yield + toc = time.time() + logging.info('Finished %s in %.3f seconds', msg, toc - tic) diff --git a/modelscope/models/science/unifold/msa/utils.py b/modelscope/models/science/unifold/msa/utils.py new file mode 100644 index 00000000..50e380d4 --- /dev/null +++ b/modelscope/models/science/unifold/msa/utils.py @@ -0,0 +1,89 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import os +from typing import Mapping, Sequence + +import json +from absl import logging + +from modelscope.models.science.unifold.data import protein + + +def get_chain_id_map( + sequences: Sequence[str], + descriptions: Sequence[str], +): + """ + Makes a mapping from PDB-format chain ID to sequence and description, + and parses the order of multi-chains + """ + unique_seqs = [] + for seq in sequences: + if seq not in unique_seqs: + unique_seqs.append(seq) + + chain_id_map = { + chain_id: { + 'descriptions': [], + 'sequence': seq + } + for chain_id, seq in zip(protein.PDB_CHAIN_IDS, unique_seqs) + } + chain_order = [] + + for seq, des in zip(sequences, descriptions): + chain_id = protein.PDB_CHAIN_IDS[unique_seqs.index(seq)] + chain_id_map[chain_id]['descriptions'].append(des) + chain_order.append(chain_id) + + return chain_id_map, chain_order + + +def divide_multi_chains( + fasta_name: str, + output_dir_base: str, + sequences: Sequence[str], + descriptions: Sequence[str], +): + """ + Divides the multi-chains fasta into several single fasta files and + records multi-chains mapping information. + """ + if len(sequences) != len(descriptions): + raise ValueError('sequences and descriptions must have equal length. ' + f'Got {len(sequences)} != {len(descriptions)}.') + if len(sequences) > protein.PDB_MAX_CHAINS: + raise ValueError( + 'Cannot process more chains than the PDB format supports. 
' + f'Got {len(sequences)} chains.') + + chain_id_map, chain_order = get_chain_id_map(sequences, descriptions) + + output_dir = os.path.join(output_dir_base, fasta_name) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + chain_id_map_path = os.path.join(output_dir, 'chain_id_map.json') + with open(chain_id_map_path, 'w') as f: + json.dump(chain_id_map, f, indent=4, sort_keys=True) + + chain_order_path = os.path.join(output_dir, 'chains.txt') + with open(chain_order_path, 'w') as f: + f.write(' '.join(chain_order)) + + logging.info('Mapping multi-chains fasta with chain order: %s', + ' '.join(chain_order)) + + temp_names = [] + temp_paths = [] + for chain_id in chain_id_map.keys(): + temp_name = fasta_name + '_{}'.format(chain_id) + temp_path = os.path.join(output_dir, temp_name + '.fasta') + des = 'chain_{}'.format(chain_id) + seq = chain_id_map[chain_id]['sequence'] + with open(temp_path, 'w') as f: + f.write('>' + des + '\n' + seq) + temp_names.append(temp_name) + temp_paths.append(temp_path) + return temp_names, temp_paths diff --git a/modelscope/pipelines/science/__init__.py b/modelscope/pipelines/science/__init__.py new file mode 100644 index 00000000..1f81809b --- /dev/null +++ b/modelscope/pipelines/science/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .protein_structure_pipeline import ProteinStructurePipeline + +else: + _import_structure = { + 'protein_structure_pipeline': ['ProteinStructurePipeline'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/pipelines/science/protein_structure_pipeline.py b/modelscope/pipelines/science/protein_structure_pipeline.py new file mode 100644 index 00000000..3dc51c72 --- /dev/null +++ b/modelscope/pipelines/science/protein_structure_pipeline.py @@ -0,0 +1,215 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
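The chain bookkeeping helpers above (`get_chain_id_map` and `divide_multi_chains`) are imported by the preprocessor module (`uni_fold.py`) later in this patch. A hypothetical standalone invocation looks roughly like this, with made-up sequences and an output directory that is only an example.

```python
# Hypothetical usage sketch for a two-chain complex; names, sequences and
# the output directory are placeholders.
from modelscope.models.science.unifold.msa.utils import divide_multi_chains

sequences = [
    'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ',
    'GSHMSLYKKAGSAAAPFT',
]
descriptions = ['chain A of the complex', 'chain B of the complex']

temp_names, temp_paths = divide_multi_chains(
    fasta_name='my_complex',
    output_dir_base='./unifold-predictions',
    sequences=sequences,
    descriptions=descriptions,
)
# Writes one single-chain fasta per unique sequence, plus chain_id_map.json
# and chains.txt recording how the chains reassemble into the complex.
print(temp_names)
print(temp_paths)
```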
+ +import os +import time +from typing import Any, Dict, List, Optional, Union + +import json +import numpy as np +import torch +from unicore.utils import tensor_tree_map + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.science.unifold.config import model_config +from modelscope.models.science.unifold.data import protein, residue_constants +from modelscope.models.science.unifold.dataset import (UnifoldDataset, + load_and_process) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.constant import Fields, Frameworks, Tasks +from modelscope.utils.device import device_placement +from modelscope.utils.hub import read_config +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ProteinStructurePipeline'] + + +def automatic_chunk_size(seq_len): + if seq_len < 512: + chunk_size = 256 + elif seq_len < 1024: + chunk_size = 128 + elif seq_len < 2048: + chunk_size = 32 + elif seq_len < 3072: + chunk_size = 16 + else: + chunk_size = 1 + return chunk_size + + +def load_feature_for_one_target( + config, + data_folder, + seed=0, + is_multimer=False, + use_uniprot=False, + symmetry_group=None, +): + if not is_multimer: + uniprot_msa_dir = None + sequence_ids = ['A'] + if use_uniprot: + uniprot_msa_dir = data_folder + + else: + uniprot_msa_dir = data_folder + sequence_ids = open(os.path.join(data_folder, + 'chains.txt')).readline().split() + + if symmetry_group is None: + batch, _ = load_and_process( + config=config.data, + mode='predict', + seed=seed, + batch_idx=None, + data_idx=0, + is_distillation=False, + sequence_ids=sequence_ids, + monomer_feature_dir=data_folder, + uniprot_msa_dir=uniprot_msa_dir, + ) + else: + raise NotImplementedError + batch = UnifoldDataset.collater([batch]) + return batch + + +@PIPELINES.register_module( + Tasks.protein_structure, module_name=Pipelines.protein_structure) +class ProteinStructurePipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """Use `model` and `preprocessor` to create a protein structure pipeline for prediction. + + Args: + model (str or Model): Supply either a local model dir which supported the protein structure task, + or a model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. 
+ + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline(task='protein-structure', + >>> model='DPTech/uni-fold-monomer') + >>> protein = 'LILNLRGGAFVSNTQITMADKQKKFINEIQEGDLVRSYSITDETFQQNAVTSIVKHEADQLCQINFGKQHVVC' + >>> print(pipeline_ins(protein)) + + """ + import copy + model_path = copy.deepcopy(model) if isinstance(model, str) else None + cfg = read_config(model_path) # only model is str + self.cfg = cfg + self.config = model_config( + cfg['pipeline']['model_name']) # alphafold config + model = model if isinstance( + model, Model) else Model.from_pretrained(model_path) + self.postprocessor = cfg.pop('postprocessor', None) + if preprocessor is None: + preprocessor_cfg = cfg.preprocessor + preprocessor = build_preprocessor(preprocessor_cfg, Fields.science) + model.eval() + model.model.inference_mode() + model.model_dir = model_path + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def _sanitize_parameters(self, **pipeline_parameters): + return pipeline_parameters, pipeline_parameters, pipeline_parameters + + def _process_single(self, input, *args, **kwargs) -> Dict[str, Any]: + preprocess_params = kwargs.get('preprocess_params', {}) + forward_params = kwargs.get('forward_params', {}) + postprocess_params = kwargs.get('postprocess_params', {}) + out = self.preprocess(input, **preprocess_params) + with device_placement(self.framework, self.device_name): + with torch.no_grad(): + out = self.forward(out, **forward_params) + + out = self.postprocess(out, **postprocess_params) + return out + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + plddts = {} + ptms = {} + + output_dir = os.path.join(self.preprocessor.output_dir_base, + inputs['target_id']) + + pdbs = [] + for seed in range(self.cfg['pipeline']['times']): + cur_seed = hash((42, seed)) % 100000 + batch = load_feature_for_one_target( + self.config, + output_dir, + cur_seed, + is_multimer=inputs['is_multimer'], + use_uniprot=inputs['is_multimer'], + symmetry_group=self.preprocessor.symmetry_group, + ) + seq_len = batch['aatype'].shape[-1] + self.model.model.globals.chunk_size = automatic_chunk_size(seq_len) + + with torch.no_grad(): + batch = { + k: torch.as_tensor(v, device='cuda:0') + for k, v in batch.items() + } + out = self.model(batch) + + def to_float(x): + if x.dtype == torch.bfloat16 or x.dtype == torch.half: + return x.float() + else: + return x + + # Toss out the recycling dimensions --- we don't need them anymore + batch = tensor_tree_map(lambda t: t[-1, 0, ...], batch) + batch = tensor_tree_map(to_float, batch) + out = tensor_tree_map(lambda t: t[0, ...], out[0]) + out = tensor_tree_map(to_float, out) + batch = tensor_tree_map(lambda x: np.array(x.cpu()), batch) + out = tensor_tree_map(lambda x: np.array(x.cpu()), out) + + plddt = out['plddt'] + mean_plddt = np.mean(plddt) + plddt_b_factors = np.repeat( + plddt[..., None], residue_constants.atom_type_num, axis=-1) + # TODO: , may need to reorder chains, based on entity_ids + cur_protein = protein.from_prediction( + features=batch, result=out, b_factors=plddt_b_factors) + cur_save_name = (f'{cur_seed}') + plddts[cur_save_name] = str(mean_plddt) + if inputs[ + 'is_multimer'] and self.preprocessor.symmetry_group is None: + ptms[cur_save_name] = str(np.mean(out['iptm+ptm'])) + with open(os.path.join(output_dir, cur_save_name + '.pdb'), + 'w') as f: + f.write(protein.to_pdb(cur_protein)) + pdbs.append(protein.to_pdb(cur_protein)) + + logger.info('plddts:' + str(plddts)) + model_name = 
self.cfg['pipeline']['model_name'] + score_name = f'{model_name}' + plddt_fname = score_name + '_plddt.json' + + with open(os.path.join(output_dir, plddt_fname), 'w') as f: + json.dump(plddts, f, indent=4) + if ptms: + logger.info('ptms' + str(ptms)) + ptm_fname = score_name + '_ptm.json' + with open(os.path.join(output_dir, ptm_fname), 'w') as f: + json.dump(ptms, f, indent=4) + + return pdbs + + def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params): + return inputs diff --git a/modelscope/preprocessors/science/__init__.py b/modelscope/preprocessors/science/__init__.py new file mode 100644 index 00000000..54b24887 --- /dev/null +++ b/modelscope/preprocessors/science/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .unifold import (UniFoldPreprocessor) + +else: + _import_structure = {'unifold': ['UniFoldPreprocessor']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/science/uni_fold.py b/modelscope/preprocessors/science/uni_fold.py new file mode 100644 index 00000000..2a44c885 --- /dev/null +++ b/modelscope/preprocessors/science/uni_fold.py @@ -0,0 +1,569 @@ +# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license, +# and is publicly available at https://github.com/dptech-corp/Uni-Fold. + +import gzip +import hashlib +import logging +import os +import pickle +import random +import re +import tarfile +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from unittest import result + +import json +import numpy as np +import requests +import torch +from tqdm import tqdm + +from modelscope.metainfo import Preprocessors +from modelscope.models.science.unifold.data import protein, residue_constants +from modelscope.models.science.unifold.data.protein import PDB_CHAIN_IDS +from modelscope.models.science.unifold.data.utils import compress_features +from modelscope.models.science.unifold.msa import parsers, pipeline, templates +from modelscope.models.science.unifold.msa.tools import hhsearch +from modelscope.models.science.unifold.msa.utils import divide_multi_chains +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields + +__all__ = [ + 'UniFoldPreprocessor', +] + +TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]' +DEFAULT_API_SERVER = 'https://api.colabfold.com' + + +def run_mmseqs2( + x, + prefix, + use_env=True, + use_templates=False, + use_pairing=False, + host_url='https://api.colabfold.com') -> Tuple[List[str], List[str]]: + submission_endpoint = 'ticket/pair' if use_pairing else 'ticket/msa' + + def submit(seqs, mode, N=101): + n, query = N, '' + for seq in seqs: + query += f'>{n}\n{seq}\n' + n += 1 + + res = requests.post( + f'{host_url}/{submission_endpoint}', + data={ + 'q': query, + 'mode': mode + }) + try: + out = res.json() + except ValueError: + out = {'status': 'ERROR'} + return out + + def status(ID): + res = requests.get(f'{host_url}/ticket/{ID}') + try: + out = res.json() + except ValueError: + out = {'status': 'ERROR'} + return out + + def download(ID, path): + res = requests.get(f'{host_url}/result/download/{ID}') 
+ with open(path, 'wb') as out: + out.write(res.content) + + # process input x + seqs = [x] if isinstance(x, str) else x + + mode = 'env' + if use_pairing: + mode = '' + use_templates = False + use_env = False + + # define path + path = f'{prefix}' + if not os.path.isdir(path): + os.mkdir(path) + + # call mmseqs2 api + tar_gz_file = f'{path}/out_{mode}.tar.gz' + N, REDO = 101, True + + # deduplicate and keep track of order + seqs_unique = [] + # TODO this might be slow for large sets + [seqs_unique.append(x) for x in seqs if x not in seqs_unique] + Ms = [N + seqs_unique.index(seq) for seq in seqs] + # lets do it! + if not os.path.isfile(tar_gz_file): + TIME_ESTIMATE = 150 * len(seqs_unique) + with tqdm(total=TIME_ESTIMATE, bar_format=TQDM_BAR_FORMAT) as pbar: + while REDO: + pbar.set_description('SUBMIT') + + # Resubmit job until it goes through + out = submit(seqs_unique, mode, N) + while out['status'] in ['UNKNOWN', 'RATELIMIT']: + sleep_time = 5 + random.randint(0, 5) + # logger.error(f"Sleeping for {sleep_time}s. Reason: {out['status']}") + # resubmit + time.sleep(sleep_time) + out = submit(seqs_unique, mode, N) + + if out['status'] == 'ERROR': + error = 'MMseqs2 API is giving errors. Please confirm your input is a valid protein sequence.' + error = error + 'If error persists, please try again an hour later.' + raise Exception(error) + + if out['status'] == 'MAINTENANCE': + raise Exception( + 'MMseqs2 API is undergoing maintenance. Please try again in a few minutes.' + ) + + # wait for job to finish + ID, TIME = out['id'], 0 + pbar.set_description(out['status']) + while out['status'] in ['UNKNOWN', 'RUNNING', 'PENDING']: + t = 5 + random.randint(0, 5) + # logger.error(f"Sleeping for {t}s. Reason: {out['status']}") + time.sleep(t) + out = status(ID) + pbar.set_description(out['status']) + if out['status'] == 'RUNNING': + TIME += t + pbar.update(n=t) + + if out['status'] == 'COMPLETE': + if TIME < TIME_ESTIMATE: + pbar.update(n=(TIME_ESTIMATE - TIME)) + REDO = False + + if out['status'] == 'ERROR': + REDO = False + error = 'MMseqs2 API is giving errors. Please confirm your input is a valid protein sequence.' + error = error + 'If error persists, please try again an hour later.' 
+ raise Exception(error) + + # Download results + download(ID, tar_gz_file) + + # prep list of a3m files + if use_pairing: + a3m_files = [f'{path}/pair.a3m'] + else: + a3m_files = [f'{path}/uniref.a3m'] + if use_env: + a3m_files.append(f'{path}/bfd.mgnify30.metaeuk30.smag30.a3m') + + # extract a3m files + if any(not os.path.isfile(a3m_file) for a3m_file in a3m_files): + with tarfile.open(tar_gz_file) as tar_gz: + tar_gz.extractall(path) + + # templates + if use_templates: + templates = {} + + with open(f'{path}/pdb70.m8', 'r') as f: + lines = f.readlines() + for line in lines: + p = line.rstrip().split() + M, pdb, _, _ = p[0], p[1], p[2], p[10] # qid, e_value + M = int(M) + if M not in templates: + templates[M] = [] + templates[M].append(pdb) + + template_paths = {} + for k, TMPL in templates.items(): + TMPL_PATH = f'{prefix}/templates_{k}' + if not os.path.isdir(TMPL_PATH): + os.mkdir(TMPL_PATH) + TMPL_LINE = ','.join(TMPL[:20]) + os.system( + f'curl -s -L {host_url}/template/{TMPL_LINE} | tar xzf - -C {TMPL_PATH}/' + ) + os.system( + f'cp {TMPL_PATH}/pdb70_a3m.ffindex {TMPL_PATH}/pdb70_cs219.ffindex' + ) + os.system(f'touch {TMPL_PATH}/pdb70_cs219.ffdata') + template_paths[k] = TMPL_PATH + + # gather a3m lines + a3m_lines = {} + for a3m_file in a3m_files: + update_M, M = True, None + with open(a3m_file, 'r') as f: + lines = f.readlines() + for line in lines: + if len(line) > 0: + if '\x00' in line: + line = line.replace('\x00', '') + update_M = True + if line.startswith('>') and update_M: + M = int(line[1:].rstrip()) + update_M = False + if M not in a3m_lines: + a3m_lines[M] = [] + a3m_lines[M].append(line) + + # return results + + a3m_lines = [''.join(a3m_lines[n]) for n in Ms] + + if use_templates: + template_paths_ = [] + for n in Ms: + if n not in template_paths: + template_paths_.append(None) + # print(f"{n-N}\tno_templates_found") + else: + template_paths_.append(template_paths[n]) + template_paths = template_paths_ + + return (a3m_lines, template_paths) if use_templates else a3m_lines + + +def get_null_template(query_sequence: Union[List[str], str], + num_temp: int = 1) -> Dict[str, Any]: + ln = ( + len(query_sequence) if isinstance(query_sequence, str) else sum( + len(s) for s in query_sequence)) + output_templates_sequence = 'A' * ln + # output_confidence_scores = np.full(ln, 1.0) + + templates_all_atom_positions = np.zeros( + (ln, templates.residue_constants.atom_type_num, 3)) + templates_all_atom_masks = np.zeros( + (ln, templates.residue_constants.atom_type_num)) + templates_aatype = templates.residue_constants.sequence_to_onehot( + output_templates_sequence, + templates.residue_constants.HHBLITS_AA_TO_ID) + template_features = { + 'template_all_atom_positions': + np.tile(templates_all_atom_positions[None], [num_temp, 1, 1, 1]), + 'template_all_atom_masks': + np.tile(templates_all_atom_masks[None], [num_temp, 1, 1]), + 'template_sequence': ['none'.encode()] * num_temp, + 'template_aatype': + np.tile(np.array(templates_aatype)[None], [num_temp, 1, 1]), + 'template_domain_names': ['none'.encode()] * num_temp, + 'template_sum_probs': + np.zeros([num_temp], dtype=np.float32), + } + return template_features + + +def get_template(a3m_lines: str, template_path: str, + query_sequence: str) -> Dict[str, Any]: + template_featurizer = templates.HhsearchHitFeaturizer( + mmcif_dir=template_path, + max_template_date='2100-01-01', + max_hits=20, + kalign_binary_path='kalign', + release_dates_path=None, + obsolete_pdbs_path=None, + ) + + hhsearch_pdb70_runner = hhsearch.HHSearch( + 
binary_path='hhsearch', databases=[f'{template_path}/pdb70']) + + hhsearch_result = hhsearch_pdb70_runner.query(a3m_lines) + hhsearch_hits = pipeline.parsers.parse_hhr(hhsearch_result) + templates_result = template_featurizer.get_templates( + query_sequence=query_sequence, hits=hhsearch_hits) + return dict(templates_result.features) + + +@PREPROCESSORS.register_module( + Fields.science, module_name=Preprocessors.unifold_preprocessor) +class UniFoldPreprocessor(Preprocessor): + + def __init__(self, **cfg): + self.symmetry_group = cfg['symmetry_group'] # "C1" + if not self.symmetry_group: + self.symmetry_group = None + self.MIN_SINGLE_SEQUENCE_LENGTH = 16 # TODO: change to cfg + self.MAX_SINGLE_SEQUENCE_LENGTH = 1000 + self.MAX_MULTIMER_LENGTH = 1000 + self.jobname = 'unifold' + self.output_dir_base = './unifold-predictions' + os.makedirs(self.output_dir_base, exist_ok=True) + + def clean_and_validate_sequence(self, input_sequence: str, min_length: int, + max_length: int) -> str: + clean_sequence = input_sequence.translate( + str.maketrans('', '', ' \n\t')).upper() + aatypes = set(residue_constants.restypes) # 20 standard aatypes. + if not set(clean_sequence).issubset(aatypes): + raise ValueError( + f'Input sequence contains non-amino acid letters: ' + f'{set(clean_sequence) - aatypes}. AlphaFold only supports 20 standard ' + 'amino acids as inputs.') + if len(clean_sequence) < min_length: + raise ValueError( + f'Input sequence is too short: {len(clean_sequence)} amino acids, ' + f'while the minimum is {min_length}') + if len(clean_sequence) > max_length: + raise ValueError( + f'Input sequence is too long: {len(clean_sequence)} amino acids, while ' + f'the maximum is {max_length}. You may be able to run it with the full ' + f'Uni-Fold system depending on your resources (system memory, ' + f'GPU memory).') + return clean_sequence + + def validate_input(self, input_sequences: Sequence[str], + symmetry_group: str, min_length: int, max_length: int, + max_multimer_length: int) -> Tuple[Sequence[str], bool]: + """Validates and cleans input sequences and determines which model to use.""" + sequences = [] + + for input_sequence in input_sequences: + if input_sequence.strip(): + input_sequence = self.clean_and_validate_sequence( + input_sequence=input_sequence, + min_length=min_length, + max_length=max_length) + sequences.append(input_sequence) + + if symmetry_group is not None and symmetry_group != 'C1': + if symmetry_group.startswith( + 'C') and symmetry_group[1:].isnumeric(): + print( + f'Using UF-Symmetry with group {symmetry_group}. If you do not ' + f'want to use UF-Symmetry, please use `C1` and copy the AU ' + f'sequences to the count in the assembly.') + is_multimer = (len(sequences) > 1) + return sequences, is_multimer, symmetry_group + else: + raise ValueError( + f'UF-Symmetry does not support symmetry group ' + f'{symmetry_group} currently. Cyclic groups (Cx) are ' + f'supported only.') + + elif len(sequences) == 1: + print('Using the single-chain model.') + return sequences, False, None + + elif len(sequences) > 1: + total_multimer_length = sum([len(seq) for seq in sequences]) + if total_multimer_length > max_multimer_length: + raise ValueError( + f'The total length of multimer sequences is too long: ' + f'{total_multimer_length}, while the maximum is ' + f'{max_multimer_length}. 
Please use the full AlphaFold ' + f'system for long multimers.') + print(f'Using the multimer model with {len(sequences)} sequences.') + return sequences, True, None + + else: + raise ValueError( + 'No input amino acid sequence provided, please provide at ' + 'least one sequence.') + + def add_hash(self, x, y): + return x + '_' + hashlib.sha1(y.encode()).hexdigest()[:5] + + def get_msa_and_templates( + self, + jobname: str, + query_seqs_unique: Union[str, List[str]], + result_dir: Path, + msa_mode: str, + use_templates: bool, + homooligomers_num: int = 1, + host_url: str = DEFAULT_API_SERVER, + ) -> Tuple[Optional[List[str]], Optional[List[str]], List[str], List[int], + List[Dict[str, Any]]]: + + use_env = msa_mode == 'MMseqs2' + + template_features = [] + if use_templates: + a3m_lines_mmseqs2, template_paths = run_mmseqs2( + query_seqs_unique, + str(result_dir.joinpath(jobname)), + use_env, + use_templates=True, + host_url=host_url, + ) + if template_paths is None: + for index in range(0, len(query_seqs_unique)): + template_feature = get_null_template( + query_seqs_unique[index]) + template_features.append(template_feature) + else: + for index in range(0, len(query_seqs_unique)): + if template_paths[index] is not None: + template_feature = get_template( + a3m_lines_mmseqs2[index], + template_paths[index], + query_seqs_unique[index], + ) + if len(template_feature['template_domain_names']) == 0: + template_feature = get_null_template( + query_seqs_unique[index]) + else: + template_feature = get_null_template( + query_seqs_unique[index]) + template_features.append(template_feature) + else: + for index in range(0, len(query_seqs_unique)): + template_feature = get_null_template(query_seqs_unique[index]) + template_features.append(template_feature) + + if msa_mode == 'single_sequence': + a3m_lines = [] + num = 101 + for i, seq in enumerate(query_seqs_unique): + a3m_lines.append('>' + str(num + i) + '\n' + seq) + else: + # find normal a3ms + a3m_lines = run_mmseqs2( + query_seqs_unique, + str(result_dir.joinpath(jobname)), + use_env, + use_pairing=False, + host_url=host_url, + ) + if len(query_seqs_unique) > 1: + # find paired a3m if not a homooligomers + paired_a3m_lines = run_mmseqs2( + query_seqs_unique, + str(result_dir.joinpath(jobname)), + use_env, + use_pairing=True, + host_url=host_url, + ) + else: + num = 101 + paired_a3m_lines = [] + for i in range(0, homooligomers_num): + paired_a3m_lines.append('>' + str(num + i) + '\n' + + query_seqs_unique[0] + '\n') + + return ( + a3m_lines, + paired_a3m_lines, + template_features, + ) + + def __call__(self, data: Union[str, Tuple]): + if isinstance(data, str): + data = [data, '', '', ''] + basejobname = ''.join(data) + basejobname = re.sub(r'\W+', '', basejobname) + target_id = self.add_hash(self.jobname, basejobname) + + sequences, is_multimer, _ = self.validate_input( + input_sequences=data, + symmetry_group=self.symmetry_group, + min_length=self.MIN_SINGLE_SEQUENCE_LENGTH, + max_length=self.MAX_SINGLE_SEQUENCE_LENGTH, + max_multimer_length=self.MAX_MULTIMER_LENGTH) + + descriptions = [ + '> ' + target_id + ' seq' + str(ii) + for ii in range(len(sequences)) + ] + + if is_multimer: + divide_multi_chains(target_id, self.output_dir_base, sequences, + descriptions) + + s = [] + for des, seq in zip(descriptions, sequences): + s += [des, seq] + + unique_sequences = [] + [ + unique_sequences.append(x) for x in sequences + if x not in unique_sequences + ] + + if len(unique_sequences) == 1: + homooligomers_num = len(sequences) + else: + 
homooligomers_num = 1 + + with open(f'{self.jobname}.fasta', 'w') as f: + f.write('\n'.join(s)) + + result_dir = Path(self.output_dir_base) + output_dir = os.path.join(self.output_dir_base, target_id) + + # msa_mode = 'single_sequence' + msa_mode = 'MMseqs2' + use_templates = True + + unpaired_msa, paired_msa, template_results = self.get_msa_and_templates( + target_id, + unique_sequences, + result_dir=result_dir, + msa_mode=msa_mode, + use_templates=use_templates, + homooligomers_num=homooligomers_num) + + features = [] + pair_features = [] + + for idx, seq in enumerate(unique_sequences): + chain_id = PDB_CHAIN_IDS[idx] + sequence_features = pipeline.make_sequence_features( + sequence=seq, + description=f'> {self.jobname} seq {chain_id}', + num_res=len(seq)) + monomer_msa = parsers.parse_a3m(unpaired_msa[idx]) + msa_features = pipeline.make_msa_features([monomer_msa]) + template_features = template_results[idx] + feature_dict = { + **sequence_features, + **msa_features, + **template_features + } + feature_dict = compress_features(feature_dict) + features_output_path = os.path.join( + output_dir, '{}.feature.pkl.gz'.format(chain_id)) + pickle.dump( + feature_dict, + gzip.GzipFile(features_output_path, 'wb'), + protocol=4) + features.append(feature_dict) + + if is_multimer: + multimer_msa = parsers.parse_a3m(paired_msa[idx]) + pair_features = pipeline.make_msa_features([multimer_msa]) + pair_feature_dict = compress_features(pair_features) + uniprot_output_path = os.path.join( + output_dir, '{}.uniprot.pkl.gz'.format(chain_id)) + pickle.dump( + pair_feature_dict, + gzip.GzipFile(uniprot_output_path, 'wb'), + protocol=4, + ) + pair_features.append(pair_feature_dict) + + # return features, pair_features, target_id + return { + 'features': features, + 'pair_features': pair_features, + 'target_id': target_id, + 'is_multimer': is_multimer, + } + + +if __name__ == '__main__': + proc = UniFoldPreprocessor() + protein_example = 'LILNLRGGAFVSNTQITMADKQKKFINEIQEGDLVRSYSITDETFQQNAVTSIVKHEADQLCQINFGKQHVVC' + \ + 'TVNHRFYDPESKLWKSVCPHPGSGISFLKKYDYLLSEEGEKLQITEIKTFTTKQPVFIYHIQVENNHNFFANGVLAHAMQVSI' + features, pair_features = proc.__call__(protein_example) + import ipdb + ipdb.set_trace() diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 86b7bb7d..45bda324 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -9,6 +9,7 @@ class Fields(object): nlp = 'nlp' audio = 'audio' multi_modal = 'multi-modal' + science = 'science' class CVTasks(object): @@ -151,6 +152,10 @@ class MultiModalTasks(object): image_text_retrieval = 'image-text-retrieval' +class ScienceTasks(object): + protein_structure = 'protein-structure' + + class TasksIODescriptions(object): image_to_image = 'image_to_image', images_to_image = 'images_to_image', @@ -167,7 +172,7 @@ class TasksIODescriptions(object): generative_multi_modal_embedding = 'generative_multi_modal_embedding' -class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): +class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks, ScienceTasks): """ Names for tasks supported by modelscope. Holds the standard task name to use for identifying different tasks. 
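With the constant.py hunks above, the new field and task names are plain string constants, so the protein-structure task becomes addressable through the usual lookup path. A minimal sanity-check sketch (illustrative only, not part of the patch):

from modelscope.utils.constant import Fields, Tasks

# ScienceTasks is mixed into Tasks, so the new task name resolves like any other.
assert Fields.science == 'science'
assert Tasks.protein_structure == 'protein-structure'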
@@ -196,6 +201,10 @@ class Tasks(CVTasks, NLPTasks, AudioTasks, MultiModalTasks): getattr(Tasks, attr) for attr in dir(MultiModalTasks) if not attr.startswith('__') ], + Fields.science: [ + getattr(Tasks, attr) for attr in dir(ScienceTasks) + if not attr.startswith('__') + ], } for field, tasks in field_dict.items(): diff --git a/requirements/science.txt b/requirements/science.txt new file mode 100644 index 00000000..72994f72 --- /dev/null +++ b/requirements/science.txt @@ -0,0 +1,6 @@ +iopath +lmdb +ml_collections +scipy +tensorboardX +tokenizers diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py new file mode 100644 index 00000000..df35dc5e --- /dev/null +++ b/tests/pipelines/test_unifold.py @@ -0,0 +1,34 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.protein_structure + self.model_id = 'DPTech/uni-fold-monomer' + self.model_id_multimer = 'DPTech/uni-fold-multimer' + + self.protein = 'MGLPKKALKESQLQFLTAGTAVSDSSHQTYKVSFIENGVIKNAFYKKLDPKNHYPELLAKISVAVSLFKRIFQGRRSAEERLVFDD' + self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ + 'NIAALKNHIDKIKPIAMQIYKKYSKNIP' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id) + mono_pipeline_ins = pipeline(task=self.task, model=model_dir) + _ = mono_pipeline_ins(self.protein) + + model_dir1 = snapshot_download(self.model_id_multimer) + multi_pipeline_ins = pipeline(task=self.task, model=model_dir1) + _ = multi_pipeline_ins(self.protein_multimer) + + +if __name__ == '__main__': + unittest.main() From 781fe49d63800f70821ad87c8d3c55730a767d95 Mon Sep 17 00:00:00 2001 From: "zhangyanzhao.zyz" Date: Wed, 26 Oct 2022 09:44:25 +0800 Subject: [PATCH 123/129] =?UTF-8?q?[to=20#42322933]=E4=BF=AE=E6=AD=A3finet?= =?UTF-8?q?une=20text=20ranking=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 之前的finetune代码当dataset最后长度不足制定batch size时会出错,现已修正 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10524066 --- modelscope/models/nlp/bert/text_ranking.py | 19 +++++++++++-------- .../task_datasets/text_ranking_dataset.py | 3 +-- tests/trainers/test_finetune_text_ranking.py | 10 +++++----- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/modelscope/models/nlp/bert/text_ranking.py b/modelscope/models/nlp/bert/text_ranking.py index 79a63045..d6bbf277 100644 --- a/modelscope/models/nlp/bert/text_ranking.py +++ b/modelscope/models/nlp/bert/text_ranking.py @@ -18,14 +18,12 @@ logger = logging.get_logger(__name__) @MODELS.register_module(Tasks.text_ranking, module_name=Models.bert) class BertForTextRanking(BertForSequenceClassification): - def __init__(self, config, **kwargs): + def __init__(self, config, *args, **kwargs): super().__init__(config) - self.train_batch_size = kwargs.get('train_batch_size', 4) + neg_sample = kwargs.get('neg_sample', 8) + self.neg_sample = neg_sample setattr(self, self.base_model_prefix, BertModel(self.config, add_pooling_layer=True)) - self.register_buffer( - 
'target_label', - torch.zeros(self.train_batch_size, dtype=torch.long)) def forward(self, input_ids=None, @@ -55,9 +53,12 @@ class BertForTextRanking(BertForSequenceClassification): pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) if self.base_model.training: - scores = logits.view(self.train_batch_size, -1) + scores = logits.view(-1, self.neg_sample + 1) + batch_size = scores.size(0) loss_fct = torch.nn.CrossEntropyLoss() - loss = loss_fct(scores, self.target_label) + target_label = torch.zeros( + batch_size, dtype=torch.long, device=scores.device) + loss = loss_fct(scores, target_label) return AttentionTextClassificationModelOutput( loss=loss, logits=logits, @@ -78,9 +79,11 @@ class BertForTextRanking(BertForSequenceClassification): Returns: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ - num_labels = kwargs.get('num_labels', 1) + neg_sample = kwargs.get('neg_sample', 4) model_args = {} if num_labels is None else {'num_labels': num_labels} + if neg_sample is not None: + model_args['neg_sample'] = neg_sample model_dir = kwargs.get('model_dir') model = super(Model, cls).from_pretrained( diff --git a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py index dd44f7c2..54276843 100644 --- a/modelscope/msdatasets/task_datasets/text_ranking_dataset.py +++ b/modelscope/msdatasets/task_datasets/text_ranking_dataset.py @@ -39,8 +39,7 @@ class TextRankingDataset(TorchTaskDataset): ['title', 'text']) self.qid_field = self.dataset_config.get('qid_field', 'query_id') if mode == ModeKeys.TRAIN: - train_config = kwargs.get('train', {}) - self.neg_samples = train_config.get('neg_samples', 4) + self.neg_samples = self.dataset_config.get('neg_sample', 4) super().__init__(datasets, mode, preprocessor, **kwargs) diff --git a/tests/trainers/test_finetune_text_ranking.py b/tests/trainers/test_finetune_text_ranking.py index 3561cb46..6e97310d 100644 --- a/tests/trainers/test_finetune_text_ranking.py +++ b/tests/trainers/test_finetune_text_ranking.py @@ -63,6 +63,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): def test_finetune_msmarco(self): def cfg_modify_fn(cfg): + neg_sample = 4 cfg.task = 'text-ranking' cfg['preprocessor'] = {'type': 'text-ranking'} cfg.train.optimizer.lr = 2e-5 @@ -73,7 +74,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'pos_sequence': 'positive_passages', 'neg_sequence': 'negative_passages', 'text_fileds': ['title', 'text'], - 'qid_field': 'query_id' + 'qid_field': 'query_id', + 'neg_sample': neg_sample }, 'val': { 'type': 'bert', @@ -84,7 +86,6 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'qid_field': 'query_id' }, } - cfg['train']['neg_samples'] = 4 cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30 cfg.train.max_epochs = 1 cfg.train.train_batch_size = 4 @@ -96,6 +97,7 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'by_epoch': False } } + cfg.model['neg_sample'] = 4 cfg.train.hooks = [{ 'type': 'CheckpointHook', 'interval': 1 @@ -151,7 +153,6 @@ class TestFinetuneSequenceClassification(unittest.TestCase): 'qid_field': 'query_id' }, } - cfg['train']['neg_samples'] = 4 cfg['evaluation']['dataloader']['batch_size_per_gpu'] = 30 cfg.train.max_epochs = 1 cfg.train.train_batch_size = 4 @@ -180,9 +181,8 @@ class TestFinetuneSequenceClassification(unittest.TestCase): # load dataset ds = MsDataset.load('dureader-retrieval-ranking', 'zyznull') - train_ds = 
ds['train'].to_hf_dataset() + train_ds = ds['train'].to_hf_dataset().shard(1000, index=0) dev_ds = ds['dev'].to_hf_dataset() - model_id = 'damo/nlp_rom_passage-ranking_chinese-base' self.finetune( model_id=model_id, From 5190c7de114740c25ebc3b686558a4d81adf880d Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Wed, 26 Oct 2022 11:53:52 +0800 Subject: [PATCH 124/129] [to #41669377] tts using default master revision model in UT Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10526747 --- tests/pipelines/test_text_to_speech.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 9a1cd7b1..50807e23 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -9,7 +9,6 @@ import unittest import torch from scipy.io.wavfile import write -from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks @@ -53,9 +52,8 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def test_pipeline(self): for i in range(len(self.zhcn_voices)): logger.info('test %s' % self.zhcn_voices[i]) - model = Model.from_pretrained( - model_name_or_path=self.zhcn_models[i], revision='pytorch_am') - sambert_hifigan_tts = pipeline(task=self.task, model=model) + sambert_hifigan_tts = pipeline( + task=self.task, model=self.zhcn_models[i]) self.assertTrue(sambert_hifigan_tts is not None) output = sambert_hifigan_tts(input=self.zhcn_text) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) @@ -63,9 +61,8 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, write('output_%s.wav' % self.zhcn_voices[i], 16000, pcm) for i in range(len(self.en_voices)): logger.info('test %s' % self.en_voices[i]) - model = Model.from_pretrained( - model_name_or_path=self.en_models[i], revision='pytorch_am') - sambert_hifigan_tts = pipeline(task=self.task, model=model) + sambert_hifigan_tts = pipeline( + task=self.task, model=self.en_models[i]) self.assertTrue(sambert_hifigan_tts is not None) output = sambert_hifigan_tts(input=self.en_text) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) From 384377b8f5502d160391c2bf78741b34ace40ef0 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 26 Oct 2022 13:55:51 +0800 Subject: [PATCH 125/129] * [to #45486649]feat: modelscope model version use model repo tag, unsupport branch or commit it, client user-agent header unified --- .dev_scripts/ci_container_test.sh | 38 ++-- .dev_scripts/dockerci.sh | 68 ++++--- modelscope/__init__.py | 4 +- modelscope/hub/api.py | 190 ++++++++++++++++---- modelscope/hub/constants.py | 3 + modelscope/hub/deploy.py | 26 ++- modelscope/hub/errors.py | 34 ++++ modelscope/hub/file_download.py | 104 ++++------- modelscope/hub/git.py | 20 +++ modelscope/hub/repository.py | 46 ++++- modelscope/hub/snapshot_download.py | 26 +-- modelscope/hub/utils/utils.py | 15 +- modelscope/msdatasets/ms_dataset.py | 6 +- modelscope/utils/constant.py | 4 +- modelscope/version.py | 4 + tests/hub/test_hub_operation.py | 36 ++-- tests/hub/test_hub_private_files.py | 59 ++++-- tests/hub/test_hub_private_repository.py | 5 +- tests/hub/test_hub_repository.py | 12 +- tests/hub/test_hub_revision.py | 145 +++++++++++++++ tests/hub/test_hub_revision_release_mode.py | 190 ++++++++++++++++++++ 21 files changed, 825 insertions(+), 210 deletions(-) create mode 100644 tests/hub/test_hub_revision.py create mode 100644 
tests/hub/test_hub_revision_release_mode.py diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 129a6c25..5922d1fb 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -1,22 +1,28 @@ -awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -pip install -r requirements/tests.txt +echo "Testing envs" +printenv +echo "ENV END" +if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + awk -F: '/^[^#]/ { print $1 }' requirements/framework.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html + awk -F: '/^[^#]/ { print $1 }' requirements/audio.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html + awk -F: '/^[^#]/ { print $1 }' requirements/cv.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html + awk -F: '/^[^#]/ { print $1 }' requirements/multi-modal.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html + awk -F: '/^[^#]/ { print $1 }' requirements/nlp.txt | xargs -n 1 pip install -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html + pip install -r requirements/tests.txt -git config --global --add safe.directory /Maas-lib + git config --global --add safe.directory /Maas-lib -# linter test -# use internal project for pre-commit due to the network problem -pre-commit run -c .pre-commit-config_local.yaml --all-files -if [ $? -ne 0 ]; then - echo "linter test failed, please run 'pre-commit run --all-files' to check" - exit -1 + # linter test + # use internal project for pre-commit due to the network problem + pre-commit run -c .pre-commit-config_local.yaml --all-files + if [ $? -ne 0 ]; then + echo "linter test failed, please run 'pre-commit run --all-files' to check" + exit -1 + fi + # test with install + python setup.py install +else + echo "Running case in release image, run case directly!" 
fi -# test with install -python setup.py install - if [ $# -eq 0 ]; then ci_command="python tests/run.py --subprocess" else diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index af94b211..c502175b 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -20,28 +20,52 @@ do # pull image if there are update docker pull ${IMAGE_NAME}:${IMAGE_VERSION} - docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ - --cpuset-cpus=${cpu_sets_arr[$gpu]} \ - --gpus="device=$gpu" \ - -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \ - -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ - -v $MODELSCOPE_HOME_CACHE/$gpu:/root \ - -v /home/admin/pre-commit:/home/admin/pre-commit \ - -e CI_TEST=True \ - -e TEST_LEVEL=$TEST_LEVEL \ - -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ - -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \ - -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ - -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ - -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ - -e TEST_LEVEL=$TEST_LEVEL \ - -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ - -e MODEL_TAG_URL=$MODEL_TAG_URL \ - --workdir=$CODE_DIR_IN_CONTAINER \ - --net host \ - ${IMAGE_NAME}:${IMAGE_VERSION} \ - $CI_COMMAND - + if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then + docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ + --cpuset-cpus=${cpu_sets_arr[$gpu]} \ + --gpus="device=$gpu" \ + -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_HOME_CACHE/$gpu:/root \ + -v /home/admin/pre-commit:/home/admin/pre-commit \ + -e CI_TEST=True \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \ + -e MODELSCOPE_SDK_DEBUG=True \ + -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ + -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ + -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ + -e MODEL_TAG_URL=$MODEL_TAG_URL \ + --workdir=$CODE_DIR_IN_CONTAINER \ + --net host \ + ${IMAGE_NAME}:${IMAGE_VERSION} \ + $CI_COMMAND + else + docker run --rm --name $CONTAINER_NAME --shm-size=16gb \ + --cpuset-cpus=${cpu_sets_arr[$gpu]} \ + --gpus="device=$gpu" \ + -v $CODE_DIR:$CODE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_CACHE:$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -v $MODELSCOPE_HOME_CACHE/$gpu:/root \ + -v /home/admin/pre-commit:/home/admin/pre-commit \ + -e CI_TEST=True \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e MODELSCOPE_CACHE=$MODELSCOPE_CACHE_DIR_IN_CONTAINER \ + -e MODELSCOPE_DOMAIN=$MODELSCOPE_DOMAIN \ + -e HUB_DATASET_ENDPOINT=$HUB_DATASET_ENDPOINT \ + -e TEST_ACCESS_TOKEN_CITEST=$TEST_ACCESS_TOKEN_CITEST \ + -e TEST_ACCESS_TOKEN_SDKDEV=$TEST_ACCESS_TOKEN_SDKDEV \ + -e TEST_LEVEL=$TEST_LEVEL \ + -e TEST_UPLOAD_MS_TOKEN=$TEST_UPLOAD_MS_TOKEN \ + -e MODEL_TAG_URL=$MODEL_TAG_URL \ + --workdir=$CODE_DIR_IN_CONTAINER \ + --net host \ + ${IMAGE_NAME}:${IMAGE_VERSION} \ + $CI_COMMAND + fi if [ $? -ne 0 ]; then echo "Running test case failed, please check the log!" exit -1 diff --git a/modelscope/__init__.py b/modelscope/__init__.py index 0746d0e6..81fdf505 100644 --- a/modelscope/__init__.py +++ b/modelscope/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from .version import __version__ +from .version import __release_datetime__, __version__ -__all__ = ['__version__'] +__all__ = ['__version__', '__release_datetime__'] diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f8ca683a..00254f16 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -4,40 +4,45 @@ import datetime import os import pickle +import platform import shutil import tempfile +import uuid from collections import defaultdict from http import HTTPStatus from http.cookiejar import CookieJar from os.path import expanduser -from typing import List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import requests +from modelscope import __version__ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_EMAIL, API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, API_RESPONSE_FIELD_MESSAGE, API_RESPONSE_FIELD_USERNAME, - DEFAULT_CREDENTIALS_PATH, Licenses, - ModelVisibility) + DEFAULT_CREDENTIALS_PATH, + MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS, + Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, - NotLoginException, RequestError, - datahub_raise_on_error, + NotLoginException, NoValidRevisionError, + RequestError, datahub_raise_on_error, handle_http_post_error, - handle_http_response, is_ok, raise_on_error) + handle_http_response, is_ok, + raise_for_http_status, raise_on_error) from modelscope.hub.git import GitCommandWrapper from modelscope.hub.repository import Repository -from modelscope.hub.utils.utils import (get_endpoint, - model_id_to_group_owner_name) from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, - DatasetFormations, DatasetMetaFormats, - DownloadMode, ModelFile) + DEFAULT_REPOSITORY_REVISION, + MASTER_MODEL_BRANCH, DatasetFormations, + DatasetMetaFormats, DownloadMode, + ModelFile) from modelscope.utils.logger import get_logger - -# yapf: enable +from .utils.utils import (get_endpoint, get_release_datetime, + model_id_to_group_owner_name) logger = get_logger() @@ -46,6 +51,7 @@ class HubApi: def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() + self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} def login( self, @@ -65,8 +71,9 @@ class HubApi: """ path = f'{self.endpoint}/api/v1/login' - r = requests.post(path, json={'AccessToken': access_token}) - r.raise_for_status() + r = requests.post( + path, json={'AccessToken': access_token}, headers=self.headers) + raise_for_http_status(r) d = r.json() raise_on_error(d) @@ -120,7 +127,8 @@ class HubApi: 'Visibility': visibility, # server check 'License': license } - r = requests.post(path, json=body, cookies=cookies) + r = requests.post( + path, json=body, cookies=cookies, headers=self.headers) handle_http_post_error(r, path, body) raise_on_error(r.json()) model_repo_url = f'{get_endpoint()}/{model_id}' @@ -140,8 +148,8 @@ class HubApi: raise ValueError('Token does not exist, please login first.') path = f'{self.endpoint}/api/v1/models/{model_id}' - r = requests.delete(path, cookies=cookies) - r.raise_for_status() + r = requests.delete(path, cookies=cookies, headers=self.headers) + raise_for_http_status(r) raise_on_error(r.json()) def get_model_url(self, model_id): @@ -170,7 +178,7 @@ class HubApi: owner_or_group, name = model_id_to_group_owner_name(model_id) path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}?Revision={revision}' - r = 
requests.get(path, cookies=cookies) + r = requests.get(path, cookies=cookies, headers=self.headers) handle_http_response(r, logger, cookies, model_id) if r.status_code == HTTPStatus.OK: if is_ok(r.json()): @@ -178,7 +186,7 @@ class HubApi: else: raise NotExistError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) def push_model(self, model_id: str, @@ -187,7 +195,7 @@ class HubApi: license: str = Licenses.APACHE_V2, chinese_name: Optional[str] = None, commit_message: Optional[str] = 'upload model', - revision: Optional[str] = DEFAULT_MODEL_REVISION): + revision: Optional[str] = DEFAULT_REPOSITORY_REVISION): """ Upload model from a given directory to given repository. A valid model directory must contain a configuration.json file. @@ -269,7 +277,7 @@ class HubApi: date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') commit_message = '[automsg] push model %s to hub at %s' % ( model_id, date) - repo.push(commit_message=commit_message, branch=revision) + repo.push(commit_message=commit_message, local_branch=revision, remote_branch=revision) except Exception: raise finally: @@ -294,7 +302,8 @@ class HubApi: path, data='{"Path":"%s", "PageNumber":%s, "PageSize": %s}' % (owner_or_group, page_number, page_size), - cookies=cookies) + cookies=cookies, + headers=self.headers) handle_http_response(r, logger, cookies, 'list_model') if r.status_code == HTTPStatus.OK: if is_ok(r.json()): @@ -303,7 +312,7 @@ class HubApi: else: raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) return None def _check_cookie(self, @@ -318,10 +327,70 @@ class HubApi: raise ValueError('Token does not exist, please login first.') return cookies + def list_model_revisions( + self, + model_id: str, + cutoff_timestamp: int = None, + use_cookies: Union[bool, CookieJar] = False) -> List[str]: + """Get model branch and tags. + + Args: + model_id (str): The model id + cutoff_timestamp (int): Tags created before the cutoff will be included. + The timestamp is represented by the seconds elasped from the epoch time. + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, will + will load cookie from local. Defaults to False. 
+ Returns: + Tuple[List[str], List[str]]: Return list of branch name and tags + """ + cookies = self._check_cookie(use_cookies) + if cutoff_timestamp is None: + cutoff_timestamp = get_release_datetime() + path = f'{self.endpoint}/api/v1/models/{model_id}/revisions?EndTime=%s' % cutoff_timestamp + r = requests.get(path, cookies=cookies, headers=self.headers) + handle_http_response(r, logger, cookies, model_id) + d = r.json() + raise_on_error(d) + info = d[API_RESPONSE_FIELD_DATA] + # tags returned from backend are guaranteed to be ordered by create-time + tags = [x['Revision'] for x in info['RevisionMap']['Tags'] + ] if info['RevisionMap']['Tags'] else [] + return tags + + def get_valid_revision(self, model_id: str, revision=None, cookies: Optional[CookieJar] = None): + release_timestamp = get_release_datetime() + current_timestamp = int(round(datetime.datetime.now().timestamp())) + # for active development in library codes (non-release-branches), release_timestamp + # is set to be a far-away-time-in-the-future, to ensure that we shall + # get the master-HEAD version from model repo by default (when no revision is provided) + if release_timestamp > current_timestamp + ONE_YEAR_SECONDS: + branches, tags = self.get_model_branches_and_tags( + model_id, use_cookies=False if cookies is None else cookies) + if revision is None: + revision = MASTER_MODEL_BRANCH + logger.info('Model revision not specified, use default: %s in development mode' % revision) + if revision not in branches and revision not in tags: + raise NotExistError('The model: %s has no branch or tag : %s .' % revision) + else: + revisions = self.list_model_revisions( + model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) + if revision is None: + if len(revisions) == 0: + raise NoValidRevisionError('The model: %s has no valid revision!' % model_id) + # tags (revisions) returned from backend are guaranteed to be ordered by create-time + # we shall obtain the latest revision created earlier than release version of this branch + revision = revisions[0] + logger.info('Model revision not specified, use the latest revision: %s' % revision) + else: + if revision not in revisions: + raise NotExistError( + 'The model: %s has no revision: %s !' % (model_id, revision)) + return revision + def get_model_branches_and_tags( self, model_id: str, - use_cookies: Union[bool, CookieJar] = False + use_cookies: Union[bool, CookieJar] = False, ) -> Tuple[List[str], List[str]]: """Get model branch and tags. 
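The revision resolution added in get_valid_revision above boils down to: in development mode (release timestamp pushed a year into the future) fall back to the master branch; in a release build pick the newest tag created before the SDK release time and reject anything else. A rough usage sketch under those assumptions (the model id below is a placeholder, not taken from this patch):

from modelscope.hub.api import HubApi, ModelScopeConfig

api = HubApi()
cookies = ModelScopeConfig.get_cookies()  # may be None for anonymous access
# revision=None triggers the default resolution described above.
revision = api.get_valid_revision('damo/some-model', revision=None, cookies=cookies)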
@@ -335,7 +404,7 @@ class HubApi: cookies = self._check_cookie(use_cookies) path = f'{self.endpoint}/api/v1/models/{model_id}/revisions' - r = requests.get(path, cookies=cookies) + r = requests.get(path, cookies=cookies, headers=self.headers) handle_http_response(r, logger, cookies, model_id) d = r.json() raise_on_error(d) @@ -376,7 +445,11 @@ class HubApi: if root is not None: path = path + f'&Root={root}' - r = requests.get(path, cookies=cookies, headers=headers) + r = requests.get( + path, cookies=cookies, headers={ + **headers, + **self.headers + }) handle_http_response(r, logger, cookies, model_id) d = r.json() @@ -392,10 +465,9 @@ class HubApi: def list_datasets(self): path = f'{self.endpoint}/api/v1/datasets' - headers = None params = {} - r = requests.get(path, params=params, headers=headers) - r.raise_for_status() + r = requests.get(path, params=params, headers=self.headers) + raise_for_http_status(r) dataset_list = r.json()[API_RESPONSE_FIELD_DATA] return [x['Name'] for x in dataset_list] @@ -425,7 +497,7 @@ class HubApi: dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' - r = requests.get(datahub_url) + r = requests.get(datahub_url, headers=self.headers) resp = r.json() datahub_raise_on_error(datahub_url, resp) file_list = resp['Data'] @@ -445,7 +517,7 @@ class HubApi: datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_path}' r = requests.get(datahub_url) - r.raise_for_status() + raise_for_http_status(r) local_path = os.path.join(cache_dir, file_path) if os.path.exists(local_path): logger.warning( @@ -490,7 +562,8 @@ class HubApi: f'ststoken?Revision={revision}' cookies = requests.utils.dict_from_cookiejar(cookies) - r = requests.get(url=datahub_url, cookies=cookies) + r = requests.get( + url=datahub_url, cookies=cookies, headers=self.headers) resp = r.json() raise_on_error(resp) return resp['Data'] @@ -512,12 +585,12 @@ class HubApi: def on_dataset_download(self, dataset_name: str, namespace: str) -> None: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' - r = requests.post(url) - r.raise_for_status() + r = requests.post(url, headers=self.headers) + raise_for_http_status(r) @staticmethod def datahub_remote_call(url): - r = requests.get(url) + r = requests.get(url, headers={'user-agent': ModelScopeConfig.get_user_agent()}) resp = r.json() datahub_raise_on_error(url, resp) return resp['Data'] @@ -531,6 +604,7 @@ class ModelScopeConfig: COOKIES_FILE_NAME = 'cookies' GIT_TOKEN_FILE_NAME = 'git_token' USER_INFO_FILE_NAME = 'user' + USER_SESSION_ID_FILE_NAME = 'session' @staticmethod def make_sure_credential_path_exist(): @@ -559,6 +633,23 @@ class ModelScopeConfig: return cookies return None + @staticmethod + def get_user_session_id(): + session_path = os.path.join(ModelScopeConfig.path_credential, + ModelScopeConfig.USER_SESSION_ID_FILE_NAME) + session_id = '' + if os.path.exists(session_path): + with open(session_path, 'rb') as f: + session_id = str(f.readline().strip(), encoding='utf-8') + return session_id + if session_id == '' or len(session_id) != 32: + session_id = str(uuid.uuid4().hex) + ModelScopeConfig.make_sure_credential_path_exist() + with open(session_path, 'w+') as wf: + wf.write(session_id) + + return session_id + @staticmethod def save_token(token: str): ModelScopeConfig.make_sure_credential_path_exist() @@ -607,3 +698,32 @@ class ModelScopeConfig: 
except FileNotFoundError: pass return token + + @staticmethod + def get_user_agent(user_agent: Union[Dict, str, None] = None, ) -> str: + """Formats a user-agent string with basic info about a request. + + Args: + user_agent (`str`, `dict`, *optional*): + The user agent info in the form of a dictionary or a single string. + + Returns: + The formatted user-agent string. + """ + env = 'custom' + if MODELSCOPE_ENVIRONMENT in os.environ: + env = os.environ[MODELSCOPE_ENVIRONMENT] + + ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % ( + __version__, + platform.python_version(), + ModelScopeConfig.get_user_session_id(), + platform.platform(), + platform.processor(), + env, + ) + if isinstance(user_agent, dict): + ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += ';' + user_agent + return ua diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index c8664597..730702c1 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -16,6 +16,9 @@ API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' API_RESPONSE_FIELD_USERNAME = 'Username' API_RESPONSE_FIELD_EMAIL = 'Email' API_RESPONSE_FIELD_MESSAGE = 'Message' +MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' +MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 class Licenses(object): diff --git a/modelscope/hub/deploy.py b/modelscope/hub/deploy.py index 397d9cc0..8cacde82 100644 --- a/modelscope/hub/deploy.py +++ b/modelscope/hub/deploy.py @@ -3,7 +3,6 @@ from abc import ABC from http import HTTPStatus from typing import Optional -import attrs import json import requests from attrs import asdict, define, field, validators @@ -12,7 +11,8 @@ from modelscope.hub.api import ModelScopeConfig from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_MESSAGE) from modelscope.hub.errors import (NotLoginException, NotSupportError, - RequestError, handle_http_response, is_ok) + RequestError, handle_http_response, is_ok, + raise_for_http_status) from modelscope.hub.utils.utils import get_endpoint from modelscope.utils.logger import get_logger @@ -188,6 +188,7 @@ class ServiceDeployer(object): def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() + self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} self.cookies = ModelScopeConfig.get_cookies() if self.cookies is None: raise NotLoginException( @@ -227,12 +228,9 @@ class ServiceDeployer(object): resource=resource, provider=provider) path = f'{self.endpoint}/api/v1/deployer/endpoint' - body = attrs.asdict(create_params) + body = asdict(create_params) r = requests.post( - path, - json=body, - cookies=self.cookies, - ) + path, json=body, cookies=self.cookies, headers=self.headers) handle_http_response(r, logger, self.cookies, 'create_service') if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES: if is_ok(r.json()): @@ -241,7 +239,7 @@ class ServiceDeployer(object): else: raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) return None def get(self, instance_name: str, provider: ServiceProviderParameters): @@ -262,7 +260,7 @@ class ServiceDeployer(object): params = GetServiceParameters(provider=provider) path = '%s/api/v1/deployer/endpoint/%s?%s' % ( self.endpoint, instance_name, params.to_query_str()) - r = requests.get(path, cookies=self.cookies) + r = requests.get(path, 
cookies=self.cookies, headers=self.headers) handle_http_response(r, logger, self.cookies, 'get_service') if r.status_code == HTTPStatus.OK: if is_ok(r.json()): @@ -271,7 +269,7 @@ class ServiceDeployer(object): else: raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) return None def delete(self, instance_name: str, provider: ServiceProviderParameters): @@ -293,7 +291,7 @@ class ServiceDeployer(object): params = DeleteServiceParameters(provider=provider) path = '%s/api/v1/deployer/endpoint/%s?%s' % ( self.endpoint, instance_name, params.to_query_str()) - r = requests.delete(path, cookies=self.cookies) + r = requests.delete(path, cookies=self.cookies, headers=self.headers) handle_http_response(r, logger, self.cookies, 'delete_service') if r.status_code == HTTPStatus.OK: if is_ok(r.json()): @@ -302,7 +300,7 @@ class ServiceDeployer(object): else: raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) return None def list(self, @@ -328,7 +326,7 @@ class ServiceDeployer(object): provider=provider, skip=skip, limit=limit) path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint, params.to_query_str()) - r = requests.get(path, cookies=self.cookies) + r = requests.get(path, cookies=self.cookies, headers=self.headers) handle_http_response(r, logger, self.cookies, 'list_service_instances') if r.status_code == HTTPStatus.OK: if is_ok(r.json()): @@ -337,5 +335,5 @@ class ServiceDeployer(object): else: raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE]) else: - r.raise_for_status() + raise_for_http_status(r) return None diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 47994bc6..bfb55e6d 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -13,6 +13,10 @@ class NotSupportError(Exception): pass +class NoValidRevisionError(Exception): + pass + + class NotExistError(Exception): pass @@ -99,3 +103,33 @@ def datahub_raise_on_error(url, rsp): raise RequestError( f"Url = {url}, Status = {rsp.get('status')}, error = {rsp.get('error')}, message = {rsp.get('message')}" ) + + +def raise_for_http_status(rsp): + """ + Attempt to decode utf-8 first since some servers + localize reason strings, for invalid utf-8, fall back + to decoding with iso-8859-1. 
+ """ + http_error_msg = '' + if isinstance(rsp.reason, bytes): + try: + reason = rsp.reason.decode('utf-8') + except UnicodeDecodeError: + reason = rsp.reason.decode('iso-8859-1') + else: + reason = rsp.reason + + if 400 <= rsp.status_code < 500: + http_error_msg = u'%s Client Error: %s for url: %s' % (rsp.status_code, + reason, rsp.url) + + elif 500 <= rsp.status_code < 600: + http_error_msg = u'%s Server Error: %s for url: %s' % (rsp.status_code, + reason, rsp.url) + + if http_error_msg: + req = rsp.request + if req.method == 'POST': + http_error_msg = u'%s, body: %s' % (http_error_msg, req.body) + raise HTTPError(http_error_msg, response=rsp) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 8ffc60bc..042ea6a6 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -2,13 +2,11 @@ import copy import os -import sys import tempfile from functools import partial from http.cookiejar import CookieJar from pathlib import Path from typing import Dict, Optional, Union -from uuid import uuid4 import requests from tqdm import tqdm @@ -23,7 +21,6 @@ from .utils.caching import ModelFileSystemCache from .utils.utils import (file_integrity_validation, get_cache_dir, get_endpoint, model_id_to_group_owner_name) -SESSION_ID = uuid4().hex logger = get_logger() @@ -34,6 +31,7 @@ def model_file_download( cache_dir: Optional[str] = None, user_agent: Union[Dict, str, None] = None, local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, ) -> Optional[str]: # pragma: no cover """ Download from a given URL and cache it if it's not already present in the @@ -104,54 +102,47 @@ def model_file_download( " online, set 'local_files_only' to False.") _api = HubApi() - headers = {'user-agent': http_user_agent(user_agent=user_agent, )} - cookies = ModelScopeConfig.get_cookies() - branches, tags = _api.get_model_branches_and_tags( - model_id, use_cookies=False if cookies is None else cookies) + headers = { + 'user-agent': ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + } + if cookies is None: + cookies = ModelScopeConfig.get_cookies() + + revision = _api.get_valid_revision( + model_id, revision=revision, cookies=cookies) file_to_download_info = None - is_commit_id = False - if revision in branches or revision in tags: # The revision is version or tag, - # we need to confirm the version is up to date - # we need to get the file list to check if the lateast version is cached, if so return, otherwise download - model_files = _api.get_model_files( - model_id=model_id, - revision=revision, - recursive=True, - use_cookies=False if cookies is None else cookies) - - for model_file in model_files: - if model_file['Type'] == 'tree': - continue - - if model_file['Path'] == file_path: - if cache.exists(model_file): - return cache.get_file_by_info(model_file) - else: - file_to_download_info = model_file - break - - if file_to_download_info is None: - raise NotExistError('The file path: %s not exist in: %s' % - (file_path, model_id)) - else: # the revision is commit id. - cached_file_path = cache.get_file_by_path_and_commit_id( - file_path, revision) - if cached_file_path is not None: - file_name = os.path.basename(cached_file_path) - logger.info( - f'File {file_name} already in cache, skip downloading!') - return cached_file_path # the file is in cache. 
- is_commit_id = True + # we need to confirm the version is up-to-date + # we need to get the file list to check if the latest version is cached, if so return, otherwise download + model_files = _api.get_model_files( + model_id=model_id, + revision=revision, + recursive=True, + use_cookies=False if cookies is None else cookies) + + for model_file in model_files: + if model_file['Type'] == 'tree': + continue + + if model_file['Path'] == file_path: + if cache.exists(model_file): + logger.info( + f'File {model_file["Name"]} already in cache, skip downloading!' + ) + return cache.get_file_by_info(model_file) + else: + file_to_download_info = model_file + break + + if file_to_download_info is None: + raise NotExistError('The file path: %s not exist in: %s' % + (file_path, model_id)) + # we need to download again url_to_download = get_file_download_url(model_id, file_path, revision) file_to_download_info = { - 'Path': - file_path, - 'Revision': - revision if is_commit_id else file_to_download_info['Revision'], - FILE_HASH: - None if (is_commit_id or FILE_HASH not in file_to_download_info) else - file_to_download_info[FILE_HASH] + 'Path': file_path, + 'Revision': file_to_download_info['Revision'], + FILE_HASH: file_to_download_info[FILE_HASH] } temp_file_name = next(tempfile._get_candidate_names()) @@ -170,25 +161,6 @@ def model_file_download( os.path.join(temporary_cache_dir, temp_file_name)) -def http_user_agent(user_agent: Union[Dict, str, None] = None, ) -> str: - """Formats a user-agent string with basic info about a request. - - Args: - user_agent (`str`, `dict`, *optional*): - The user agent info in the form of a dictionary or a single string. - - Returns: - The formatted user-agent string. - """ - ua = f'modelscope/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}' - - if isinstance(user_agent, dict): - ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) - elif isinstance(user_agent, str): - ua = user_agent - return ua - - def get_file_download_url(model_id: str, file_path: str, revision: str): """ Format file download url according to `model_id`, `revision` and `file_path`. 
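With the rewrite above, model_file_download always resolves the revision through get_valid_revision, checks the cache against the returned file metadata, and accepts an optional cookies argument so callers can reuse an authenticated session. A short usage sketch under those assumptions (the model id is a placeholder):

from modelscope.hub.api import ModelScopeConfig
from modelscope.hub.file_download import model_file_download

# Reuse locally stored login cookies, if any, so private repos resolve too.
cookies = ModelScopeConfig.get_cookies()
local_path = model_file_download(
    'damo/some-model', file_path='configuration.json', cookies=cookies)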
diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index fe1d1554..7943023b 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -5,6 +5,7 @@ import subprocess from typing import List from modelscope.utils.logger import get_logger +from ..utils.constant import MASTER_MODEL_BRANCH from .errors import GitError logger = get_logger() @@ -227,3 +228,22 @@ class GitCommandWrapper(metaclass=Singleton): files.append(line.split(' ')[-1]) return files + + def tag(self, + repo_dir: str, + tag_name: str, + message: str, + ref: str = MASTER_MODEL_BRANCH): + cmd_args = [ + '-C', repo_dir, 'tag', tag_name, '-m', + '"%s"' % message, ref + ] + rsp = self._run_git_command(*cmd_args) + logger.debug(rsp.stdout.decode('utf8')) + return rsp + + def push_tag(self, repo_dir: str, tag_name): + cmd_args = ['-C', repo_dir, 'push', 'origin', tag_name] + rsp = self._run_git_command(*cmd_args) + logger.debug(rsp.stdout.decode('utf8')) + return rsp diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index 35c831a9..6b116f79 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -5,7 +5,8 @@ from typing import Optional from modelscope.hub.errors import GitError, InvalidParameter, NotLoginException from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, - DEFAULT_MODEL_REVISION) + DEFAULT_REPOSITORY_REVISION, + MASTER_MODEL_BRANCH) from modelscope.utils.logger import get_logger from .git import GitCommandWrapper from .utils.utils import get_endpoint @@ -20,7 +21,7 @@ class Repository: def __init__(self, model_dir: str, clone_from: str, - revision: Optional[str] = DEFAULT_MODEL_REVISION, + revision: Optional[str] = DEFAULT_REPOSITORY_REVISION, auth_token: Optional[str] = None, git_path: Optional[str] = None): """ @@ -89,7 +90,8 @@ class Repository: def push(self, commit_message: str, - branch: Optional[str] = DEFAULT_MODEL_REVISION, + local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION, + remote_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION, force: bool = False): """Push local files to remote, this method will do. git pull @@ -116,14 +118,48 @@ class Repository: url = self.git_wrapper.get_repo_remote_url(self.model_dir) self.git_wrapper.pull(self.model_dir) + self.git_wrapper.add(self.model_dir, all_files=True) self.git_wrapper.commit(self.model_dir, commit_message) self.git_wrapper.push( repo_dir=self.model_dir, token=self.auth_token, url=url, - local_branch=branch, - remote_branch=branch) + local_branch=local_branch, + remote_branch=remote_branch) + + def tag(self, tag_name: str, message: str, ref: str = MASTER_MODEL_BRANCH): + """Create a new tag. + Args: + tag_name (str): The name of the tag + message (str): The tag message. + ref (str): The tag reference, can be commit id or branch. + """ + if tag_name is None or tag_name == '': + msg = 'We use tag-based revision, therefore tag_name cannot be None or empty.' + raise InvalidParameter(msg) + if message is None or message == '': + msg = 'We use annotated tag, therefore message cannot None or empty.' + self.git_wrapper.tag( + repo_dir=self.model_dir, + tag_name=tag_name, + message=message, + ref=ref) + + def tag_and_push(self, + tag_name: str, + message: str, + ref: str = MASTER_MODEL_BRANCH): + """Create tag and push to remote + + Args: + tag_name (str): The name of the tag + message (str): The tag message. + ref (str, optional): The tag ref, can be commit id or branch. Defaults to MASTER_MODEL_BRANCH. 
+ """ + self.tag(tag_name, message, ref) + + self.git_wrapper.push_tag(repo_dir=self.model_dir, tag_name=tag_name) class DatasetRepository: diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index ac57d1b1..4b81de44 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -2,6 +2,7 @@ import os import tempfile +from http.cookiejar import CookieJar from pathlib import Path from typing import Dict, Optional, Union @@ -9,9 +10,7 @@ from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.utils.constant import DEFAULT_MODEL_REVISION from modelscope.utils.logger import get_logger from .constants import FILE_HASH -from .errors import NotExistError -from .file_download import (get_file_download_url, http_get_file, - http_user_agent) +from .file_download import get_file_download_url, http_get_file from .utils.caching import ModelFileSystemCache from .utils.utils import (file_integrity_validation, get_cache_dir, model_id_to_group_owner_name) @@ -23,7 +22,8 @@ def snapshot_download(model_id: str, revision: Optional[str] = DEFAULT_MODEL_REVISION, cache_dir: Union[str, Path, None] = None, user_agent: Optional[Union[Dict, str]] = None, - local_files_only: Optional[bool] = False) -> str: + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None) -> str: """Download all files of a repo. Downloads a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from a repo, because you don't know which @@ -81,15 +81,15 @@ def snapshot_download(model_id: str, ) # we can not confirm the cached file is for snapshot 'revision' else: # make headers - headers = {'user-agent': http_user_agent(user_agent=user_agent, )} + headers = { + 'user-agent': + ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + } _api = HubApi() - cookies = ModelScopeConfig.get_cookies() - # get file list from model repo - branches, tags = _api.get_model_branches_and_tags( - model_id, use_cookies=False if cookies is None else cookies) - if revision not in branches and revision not in tags: - raise NotExistError('The specified branch or tag : %s not exist!' 
- % revision) + if cookies is None: + cookies = ModelScopeConfig.get_cookies() + revision = _api.get_valid_revision( + model_id, revision=revision, cookies=cookies) snapshot_header = headers if 'CI_TEST' in os.environ else { **headers, @@ -110,7 +110,7 @@ def snapshot_download(model_id: str, for model_file in model_files: if model_file['Type'] == 'tree': continue - # check model_file is exist in cache, if exist, skip download, otherwise download + # check model_file is exist in cache, if existed, skip download, otherwise download if cache.exists(model_file): file_name = os.path.basename(model_file['Name']) logger.info( diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 7d3c2499..a54f3413 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -2,11 +2,12 @@ import hashlib import os +from datetime import datetime from typing import Optional from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, - MODEL_ID_SEPARATOR, + MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, MODELSCOPE_URL_SCHEME) from modelscope.hub.errors import FileIntegrityError from modelscope.utils.file_utils import get_default_cache_dir @@ -37,6 +38,18 @@ def get_cache_dir(model_id: Optional[str] = None): base_path, model_id + '/') +def get_release_datetime(): + if MODELSCOPE_SDK_DEBUG in os.environ: + rt = int(round(datetime.now().timestamp())) + else: + from modelscope import version + rt = int( + round( + datetime.strptime(version.__release_datetime__, + '%Y-%m-%d %H:%M:%S').timestamp())) + return rt + + def get_endpoint(): modelscope_domain = os.getenv('MODELSCOPE_DOMAIN', DEFAULT_MODELSCOPE_DOMAIN) diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index cf055d6d..ad900bab 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -675,4 +675,8 @@ class MsDataset: revision=revision, auth_token=auth_token, git_path=git_path) - _repo.push(commit_message=commit_message, branch=revision, force=force) + _repo.push( + commit_message=commit_message, + local_branch=revision, + remote_branch=revision, + force=force) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 45bda324..6394ad8a 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -311,7 +311,9 @@ class Frameworks(object): kaldi = 'kaldi' -DEFAULT_MODEL_REVISION = 'master' +DEFAULT_MODEL_REVISION = None +MASTER_MODEL_BRANCH = 'master' +DEFAULT_REPOSITORY_REVISION = 'master' DEFAULT_DATASET_REVISION = 'master' DEFAULT_DATASET_NAMESPACE = 'modelscope' diff --git a/modelscope/version.py b/modelscope/version.py index 2b8877c5..541dfc57 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1,5 @@ +# Make sure to modify __release_datetime__ to release time when making official release. 
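
The new `__release_datetime__` field, together with `get_release_datetime()` above, is what lets a released SDK resolve a default revision when the caller passes `revision=None`. The snippet below is only a rough sketch of that selection rule as exercised by the revision tests later in this patch, not the actual `get_valid_revision` implementation (which is not shown here); the `'CreatedAt'` field name and the tag-list shape are assumptions.

    # Illustrative sketch: pick the newest tag that is not newer than the SDK
    # release time. 'tags' is assumed to be a list of dicts such as
    # [{'Revision': 'v0.1_test_revision', 'CreatedAt': 1665301000}, ...].
    def pick_default_revision(tags, release_timestamp):
        candidates = [t for t in tags if t['CreatedAt'] <= release_timestamp]
        if not candidates:
            # the real client either falls back to the master branch or raises
            # NoValidRevisionError here, depending on the SDK mode
            return None
        return max(candidates, key=lambda t: t['CreatedAt'])['Revision']

Because `get_release_datetime()` returns the current time when `MODELSCOPE_SDK_DEBUG` is set, and the default `__release_datetime__` on development branches is far in the future, development installs effectively see every existing tag; an official release freezes the timestamp so that default downloads are pinned to the tags that existed at release time, which is what the new revision tests verify.
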
__version__ = '0.5.0' +# default release datetime for branches under active development is set +# to be a time far-far-away-into-the-future +__release_datetime__ = '2099-10-13 08:56:12' diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py index f2bdb2d3..828b97f8 100644 --- a/tests/hub/test_hub_operation.py +++ b/tests/hub/test_hub_operation.py @@ -25,10 +25,10 @@ class HubOperationTest(unittest.TestCase): def setUp(self): self.api = HubApi() - # note this is temporary before official account management is ready self.api.login(TEST_ACCESS_TOKEN1) - self.model_name = uuid.uuid4().hex + self.model_name = 'op-%s' % (uuid.uuid4().hex) self.model_id = '%s/%s' % (TEST_MODEL_ORG, self.model_name) + self.revision = 'v0.1_test_revision' self.api.create_model( model_id=self.model_id, visibility=ModelVisibility.PUBLIC, @@ -46,6 +46,7 @@ class HubOperationTest(unittest.TestCase): os.system("echo 'testtest'>%s" % os.path.join(self.model_dir, download_model_file_name)) repo.push('add model') + repo.tag_and_push(self.revision, 'Test revision') def test_model_repo_creation(self): # change to proper model names before use @@ -61,7 +62,9 @@ class HubOperationTest(unittest.TestCase): def test_download_single_file(self): self.prepare_case() downloaded_file = model_file_download( - model_id=self.model_id, file_path=download_model_file_name) + model_id=self.model_id, + file_path=download_model_file_name, + revision=self.revision) assert os.path.exists(downloaded_file) mdtime1 = os.path.getmtime(downloaded_file) # download again @@ -78,17 +81,16 @@ class HubOperationTest(unittest.TestCase): assert os.path.exists(downloaded_file_path) mdtime1 = os.path.getmtime(downloaded_file_path) # download again - snapshot_path = snapshot_download(model_id=self.model_id) + snapshot_path = snapshot_download( + model_id=self.model_id, revision=self.revision) mdtime2 = os.path.getmtime(downloaded_file_path) assert mdtime1 == mdtime2 - model_file_download( - model_id=self.model_id, - file_path=download_model_file_name) # not add counter def test_download_public_without_login(self): self.prepare_case() rmtree(ModelScopeConfig.path_credential) - snapshot_path = snapshot_download(model_id=self.model_id) + snapshot_path = snapshot_download( + model_id=self.model_id, revision=self.revision) downloaded_file_path = os.path.join(snapshot_path, download_model_file_name) assert os.path.exists(downloaded_file_path) @@ -96,26 +98,38 @@ class HubOperationTest(unittest.TestCase): downloaded_file = model_file_download( model_id=self.model_id, file_path=download_model_file_name, + revision=self.revision, cache_dir=temporary_dir) assert os.path.exists(downloaded_file) self.api.login(TEST_ACCESS_TOKEN1) def test_snapshot_delete_download_cache_file(self): self.prepare_case() - snapshot_path = snapshot_download(model_id=self.model_id) + snapshot_path = snapshot_download( + model_id=self.model_id, revision=self.revision) downloaded_file_path = os.path.join(snapshot_path, download_model_file_name) assert os.path.exists(downloaded_file_path) os.remove(downloaded_file_path) # download again in cache file_download_path = model_file_download( - model_id=self.model_id, file_path=ModelFile.README) + model_id=self.model_id, + file_path=ModelFile.README, + revision=self.revision) assert os.path.exists(file_download_path) # deleted file need download again file_download_path = model_file_download( - model_id=self.model_id, file_path=download_model_file_name) + model_id=self.model_id, + file_path=download_model_file_name, + 
revision=self.revision) assert os.path.exists(file_download_path) + def test_snapshot_download_default_revision(self): + pass # TOTO + + def test_file_download_default_revision(self): + pass # TODO + def get_model_download_times(self): url = f'{self.api.endpoint}/api/v1/models/{self.model_id}/downloads' cookies = ModelScopeConfig.get_cookies() diff --git a/tests/hub/test_hub_private_files.py b/tests/hub/test_hub_private_files.py index d19a7c64..73c4cca3 100644 --- a/tests/hub/test_hub_private_files.py +++ b/tests/hub/test_hub_private_files.py @@ -17,23 +17,34 @@ from .test_utils import (TEST_ACCESS_TOKEN1, TEST_ACCESS_TOKEN2, TEST_MODEL_CHINESE_NAME, TEST_MODEL_ORG, delete_credential) +download_model_file_name = 'test.bin' + class HubPrivateFileDownloadTest(unittest.TestCase): def setUp(self): self.old_cwd = os.getcwd() self.api = HubApi() - # note this is temporary before official account management is ready self.token, _ = self.api.login(TEST_ACCESS_TOKEN1) - self.model_name = uuid.uuid4().hex + self.model_name = 'pf-%s' % (uuid.uuid4().hex) self.model_id = '%s/%s' % (TEST_MODEL_ORG, self.model_name) + self.revision = 'v0.1_test_revision' self.api.create_model( model_id=self.model_id, - visibility=ModelVisibility.PRIVATE, # 1-private, 5-public + visibility=ModelVisibility.PRIVATE, license=Licenses.APACHE_V2, chinese_name=TEST_MODEL_CHINESE_NAME, ) + def prepare_case(self): + temporary_dir = tempfile.mkdtemp() + self.model_dir = os.path.join(temporary_dir, self.model_name) + repo = Repository(self.model_dir, clone_from=self.model_id) + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name)) + repo.push('add model') + repo.tag_and_push(self.revision, 'Test revision') + def tearDown(self): # credential may deleted or switch login name, we need re-login here # to ensure the temporary model is deleted. 
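
All of the updated hub tests follow the same publish-then-download-by-tag pattern that this patch introduces. As a compact reference, the flow looks roughly like this; the model id, file name and tag are placeholders, and it assumes `HubApi().login(...)` has already been called as in the tests' `setUp`:

    import os
    import tempfile

    from modelscope.hub.file_download import model_file_download
    from modelscope.hub.repository import Repository

    model_id = 'some_org/some_model'  # placeholder model id
    work_dir = os.path.join(tempfile.mkdtemp(), 'some_model')

    # clone on DEFAULT_REPOSITORY_REVISION ('master'), add a file and push it
    repo = Repository(work_dir, clone_from=model_id)
    with open(os.path.join(work_dir, 'test.bin'), 'w') as f:
        f.write('testtest')
    repo.push('add model')

    # publish an annotated tag and push it; downloads are then addressed by this tag
    repo.tag_and_push('v0.1_test_revision', 'Test revision')
    local_path = model_file_download(
        model_id, 'test.bin', revision='v0.1_test_revision')

`snapshot_download` accepts the same `revision` argument, so a whole repository snapshot can be pinned to a tag in the same way.
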
@@ -42,49 +53,67 @@ class HubPrivateFileDownloadTest(unittest.TestCase): self.api.delete_model(model_id=self.model_id) def test_snapshot_download_private_model(self): - snapshot_path = snapshot_download(self.model_id) + self.prepare_case() + snapshot_path = snapshot_download(self.model_id, self.revision) assert os.path.exists(os.path.join(snapshot_path, ModelFile.README)) def test_snapshot_download_private_model_no_permission(self): + self.prepare_case() self.token, _ = self.api.login(TEST_ACCESS_TOKEN2) with self.assertRaises(HTTPError): - snapshot_download(self.model_id) + snapshot_download(self.model_id, self.revision) def test_snapshot_download_private_model_without_login(self): + self.prepare_case() delete_credential() with self.assertRaises(HTTPError): - snapshot_download(self.model_id) + snapshot_download(self.model_id, self.revision) def test_download_file_private_model(self): - file_path = model_file_download(self.model_id, ModelFile.README) + self.prepare_case() + file_path = model_file_download(self.model_id, ModelFile.README, + self.revision) assert os.path.exists(file_path) def test_download_file_private_model_no_permission(self): + self.prepare_case() self.token, _ = self.api.login(TEST_ACCESS_TOKEN2) with self.assertRaises(HTTPError): - model_file_download(self.model_id, ModelFile.README) + model_file_download(self.model_id, ModelFile.README, self.revision) def test_download_file_private_model_without_login(self): + self.prepare_case() delete_credential() with self.assertRaises(HTTPError): - model_file_download(self.model_id, ModelFile.README) + model_file_download(self.model_id, ModelFile.README, self.revision) def test_snapshot_download_local_only(self): + self.prepare_case() with self.assertRaises(ValueError): - snapshot_download(self.model_id, local_files_only=True) - snapshot_path = snapshot_download(self.model_id) + snapshot_download( + self.model_id, self.revision, local_files_only=True) + snapshot_path = snapshot_download(self.model_id, self.revision) assert os.path.exists(os.path.join(snapshot_path, ModelFile.README)) - snapshot_path = snapshot_download(self.model_id, local_files_only=True) + snapshot_path = snapshot_download( + self.model_id, self.revision, local_files_only=True) assert os.path.exists(snapshot_path) def test_file_download_local_only(self): + self.prepare_case() with self.assertRaises(ValueError): model_file_download( - self.model_id, ModelFile.README, local_files_only=True) - file_path = model_file_download(self.model_id, ModelFile.README) + self.model_id, + ModelFile.README, + self.revision, + local_files_only=True) + file_path = model_file_download(self.model_id, ModelFile.README, + self.revision) assert os.path.exists(file_path) file_path = model_file_download( - self.model_id, ModelFile.README, local_files_only=True) + self.model_id, + ModelFile.README, + revision=self.revision, + local_files_only=True) assert os.path.exists(file_path) diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py index dab2b891..271a715c 100644 --- a/tests/hub/test_hub_private_repository.py +++ b/tests/hub/test_hub_private_repository.py @@ -21,13 +21,12 @@ class HubPrivateRepositoryTest(unittest.TestCase): def setUp(self): self.old_cwd = os.getcwd() self.api = HubApi() - # note this is temporary before official account management is ready self.token, _ = self.api.login(TEST_ACCESS_TOKEN1) - self.model_name = uuid.uuid4().hex + self.model_name = 'pr-%s' % (uuid.uuid4().hex) self.model_id = '%s/%s' % (TEST_MODEL_ORG, 
self.model_name) self.api.create_model( model_id=self.model_id, - visibility=ModelVisibility.PRIVATE, # 1-private, 5-public + visibility=ModelVisibility.PRIVATE, license=Licenses.APACHE_V2, chinese_name=TEST_MODEL_CHINESE_NAME, ) diff --git a/tests/hub/test_hub_repository.py b/tests/hub/test_hub_repository.py index 9dfe8efd..850d5840 100644 --- a/tests/hub/test_hub_repository.py +++ b/tests/hub/test_hub_repository.py @@ -22,6 +22,7 @@ from .test_utils import (TEST_ACCESS_TOKEN1, TEST_MODEL_CHINESE_NAME, logger = get_logger() logger.setLevel('DEBUG') DEFAULT_GIT_PATH = 'git' +download_model_file_name = 'test.bin' class HubRepositoryTest(unittest.TestCase): @@ -29,13 +30,13 @@ class HubRepositoryTest(unittest.TestCase): def setUp(self): self.old_cwd = os.getcwd() self.api = HubApi() - # note this is temporary before official account management is ready self.api.login(TEST_ACCESS_TOKEN1) - self.model_name = uuid.uuid4().hex + self.model_name = 'repo-%s' % (uuid.uuid4().hex) self.model_id = '%s/%s' % (TEST_MODEL_ORG, self.model_name) + self.revision = 'v0.1_test_revision' self.api.create_model( model_id=self.model_id, - visibility=ModelVisibility.PUBLIC, # 1-private, 5-public + visibility=ModelVisibility.PUBLIC, license=Licenses.APACHE_V2, chinese_name=TEST_MODEL_CHINESE_NAME, ) @@ -67,9 +68,10 @@ class HubRepositoryTest(unittest.TestCase): os.system("echo 'lfs'>%s" % os.path.join(self.model_dir, lfs_file1)) os.system("echo 'lfs2'>%s" % os.path.join(self.model_dir, lfs_file2)) repo.push('test') - add1 = model_file_download(self.model_id, 'add1.py') + repo.tag_and_push(self.revision, 'Test revision') + add1 = model_file_download(self.model_id, 'add1.py', self.revision) assert os.path.exists(add1) - add2 = model_file_download(self.model_id, 'add2.py') + add2 = model_file_download(self.model_id, 'add2.py', self.revision) assert os.path.exists(add2) # check lfs files. git_wrapper = GitCommandWrapper() diff --git a/tests/hub/test_hub_revision.py b/tests/hub/test_hub_revision.py new file mode 100644 index 00000000..13ec1c9a --- /dev/null +++ b/tests/hub/test_hub_revision.py @@ -0,0 +1,145 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import tempfile +import unittest +import uuid +from datetime import datetime + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.errors import NotExistError, NoValidRevisionError +from modelscope.hub.file_download import model_file_download +from modelscope.hub.repository import Repository +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from .test_utils import (TEST_ACCESS_TOKEN1, TEST_MODEL_CHINESE_NAME, + TEST_MODEL_ORG) + +logger = get_logger() +logger.setLevel('DEBUG') +download_model_file_name = 'test.bin' +download_model_file_name2 = 'test2.bin' + + +class HubRevisionTest(unittest.TestCase): + + def setUp(self): + self.api = HubApi() + self.api.login(TEST_ACCESS_TOKEN1) + self.model_name = 'rv-%s' % (uuid.uuid4().hex) + self.model_id = '%s/%s' % (TEST_MODEL_ORG, self.model_name) + self.revision = 'v0.1_test_revision' + self.revision2 = 'v0.2_test_revision' + self.api.create_model( + model_id=self.model_id, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2, + chinese_name=TEST_MODEL_CHINESE_NAME, + ) + + def tearDown(self): + self.api.delete_model(model_id=self.model_id) + + def prepare_repo_data(self): + temporary_dir = tempfile.mkdtemp() + self.model_dir = os.path.join(temporary_dir, self.model_name) + self.repo = Repository(self.model_dir, clone_from=self.model_id) + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name)) + self.repo.push('add model') + self.repo.tag_and_push(self.revision, 'Test revision') + + def test_no_tag(self): + with self.assertRaises(NoValidRevisionError): + snapshot_download(self.model_id, None) + + with self.assertRaises(NoValidRevisionError): + model_file_download(self.model_id, ModelFile.README) + + def test_with_only_one_tag(self): + self.prepare_repo_data() + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, ModelFile.README, cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + + def add_new_file_and_tag(self): + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name2)) + self.repo.push('add new file') + self.repo.tag_and_push(self.revision2, 'Test revision') + + def test_snapshot_download_different_revision(self): + self.prepare_repo_data() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time stamp: %s' % t1) + snapshot_path = snapshot_download(self.model_id, self.revision) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + self.add_new_file_and_tag() + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert not os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + 
assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + + def test_file_download_different_revision(self): + self.prepare_repo_data() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time stamp: %s' % t1) + file_path = model_file_download(self.model_id, + download_model_file_name, + self.revision) + assert os.path.exists(file_path) + self.add_new_file_and_tag() + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + with self.assertRaises(NotExistError): + model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision, + cache_dir=temp_cache_dir) + + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=self.revision2, + cache_dir=temp_cache_dir) + print('Downloaded file path: %s' % file_path) + assert os.path.exists(file_path) + file_path = model_file_download( + self.model_id, + download_model_file_name2, + revision=self.revision2, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/hub/test_hub_revision_release_mode.py b/tests/hub/test_hub_revision_release_mode.py new file mode 100644 index 00000000..729a1861 --- /dev/null +++ b/tests/hub/test_hub_revision_release_mode.py @@ -0,0 +1,190 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import tempfile +import time +import unittest +import uuid +from datetime import datetime +from unittest import mock + +from modelscope import version +from modelscope.hub.api import HubApi +from modelscope.hub.constants import (MODELSCOPE_SDK_DEBUG, Licenses, + ModelVisibility) +from modelscope.hub.errors import NotExistError +from modelscope.hub.file_download import model_file_download +from modelscope.hub.repository import Repository +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.utils.logger import get_logger +from .test_utils import (TEST_ACCESS_TOKEN1, TEST_MODEL_CHINESE_NAME, + TEST_MODEL_ORG) + +logger = get_logger() +logger.setLevel('DEBUG') +download_model_file_name = 'test.bin' +download_model_file_name2 = 'test2.bin' + + +class HubRevisionTest(unittest.TestCase): + + def setUp(self): + self.api = HubApi() + self.api.login(TEST_ACCESS_TOKEN1) + self.model_name = 'rvr-%s' % (uuid.uuid4().hex) + self.model_id = '%s/%s' % (TEST_MODEL_ORG, self.model_name) + self.revision = 'v0.1_test_revision' + self.revision2 = 'v0.2_test_revision' + self.api.create_model( + model_id=self.model_id, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2, + chinese_name=TEST_MODEL_CHINESE_NAME, + ) + names_to_remove = {MODELSCOPE_SDK_DEBUG} + self.modified_environ = { + k: v + for k, v in os.environ.items() if k not in names_to_remove + } + + def tearDown(self): + self.api.delete_model(model_id=self.model_id) + + def prepare_repo_data(self): + temporary_dir = tempfile.mkdtemp() + self.model_dir = os.path.join(temporary_dir, self.model_name) + self.repo = Repository(self.model_dir, clone_from=self.model_id) + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name)) + self.repo.push('add model') + + def prepare_repo_data_and_tag(self): + self.prepare_repo_data() + self.repo.tag_and_push(self.revision, 'Test revision') + + def 
add_new_file_and_tag_to_repo(self): + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name2)) + self.repo.push('add new file') + self.repo.tag_and_push(self.revision2, 'Test revision') + + def add_new_file_and_branch_to_repo(self, branch_name): + os.system("echo 'testtest'>%s" + % os.path.join(self.model_dir, download_model_file_name2)) + self.repo.push('add new file', remote_branch=branch_name) + + def test_dev_mode_default_master(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data() # no tag, default get master + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + + def test_dev_mode_specify_branch(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data() # no tag, default get master + branch_name = 'test' + self.add_new_file_and_branch_to_repo(branch_name) + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, + revision=branch_name, + cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + with tempfile.TemporaryDirectory() as temp_cache_dir: + file_path = model_file_download( + self.model_id, + download_model_file_name, + revision=branch_name, + cache_dir=temp_cache_dir) + assert os.path.exists(file_path) + + def test_snapshot_download_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time: %s' % t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Secnod time: %s' % t2) + # set + release_datetime_backup = version.__release_datetime__ + logger.info('Origin __release_datetime__: %s' + % version.__release_datetime__) + try: + logger.info('Setting __release_datetime__ to: %s' % t1) + version.__release_datetime__ = t1 + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert not os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + version.__release_datetime__ = t2 + logger.info('Setting __release_datetime__ to: %s' % t2) + with tempfile.TemporaryDirectory() as temp_cache_dir: + snapshot_path = snapshot_download( + self.model_id, cache_dir=temp_cache_dir) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name)) + assert os.path.exists( + os.path.join(snapshot_path, download_model_file_name2)) + finally: + version.__release_datetime__ = release_datetime_backup + + def test_file_download_revision(self): + with mock.patch.dict(os.environ, self.modified_environ, clear=True): + self.prepare_repo_data_and_tag() + t1 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('First time stamp: %s' % t1) + time.sleep(10) + self.add_new_file_and_tag_to_repo() + t2 = datetime.now().isoformat(sep=' ', timespec='seconds') + logger.info('Second time: 
%s' % t2)
+            release_datetime_backup = version.__release_datetime__
+            logger.info('Origin __release_datetime__: %s'
+                        % version.__release_datetime__)
+            try:
+                version.__release_datetime__ = t1
+                logger.info('Setting __release_datetime__ to: %s' % t1)
+                with tempfile.TemporaryDirectory() as temp_cache_dir:
+                    file_path = model_file_download(
+                        self.model_id,
+                        download_model_file_name,
+                        cache_dir=temp_cache_dir)
+                    assert os.path.exists(file_path)
+                    with self.assertRaises(NotExistError):
+                        model_file_download(
+                            self.model_id,
+                            download_model_file_name2,
+                            cache_dir=temp_cache_dir)
+                version.__release_datetime__ = t2
+                logger.info('Setting __release_datetime__ to: %s' % t2)
+                with tempfile.TemporaryDirectory() as temp_cache_dir:
+                    file_path = model_file_download(
+                        self.model_id,
+                        download_model_file_name,
+                        cache_dir=temp_cache_dir)
+                    print('Downloaded file path: %s' % file_path)
+                    assert os.path.exists(file_path)
+                    file_path = model_file_download(
+                        self.model_id,
+                        download_model_file_name2,
+                        cache_dir=temp_cache_dir)
+                    assert os.path.exists(file_path)
+            finally:
+                version.__release_datetime__ = release_datetime_backup
+
+
+if __name__ == '__main__':
+    unittest.main()

From 13f7e9ceca1e644386ccf4887a948d477b550b40 Mon Sep 17 00:00:00 2001
From: "ran.zhou"
Date: Wed, 26 Oct 2022 14:52:22 +0800
Subject: [PATCH 126/129] [to #42322933] SEA multilingual NLP (NER & word segmentation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add NLP support for low-resource Southeast Asian languages, including:
1. Preprocessing for Thai and Vietnamese NER
2. A word segmentation model and pipeline based on the XLMR-CRF architecture
3. Preprocessing for Thai word segmentation

Unit tests are added for the corresponding pipelines.

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10492404

---
 modelscope/metainfo.py                        |   9 +
 modelscope/models/nlp/__init__.py             |  52 +-
 modelscope/models/nlp/task_models/__init__.py |   9 +-
 .../nncrf_for_word_segmentation.py            | 639 ++++++++++++++++++
 modelscope/pipelines/nlp/__init__.py          |  17 +-
 ...multilingual_word_segmentation_pipeline.py | 125 ++++
 .../nlp/named_entity_recognition_pipeline.py  |  44 +-
 modelscope/preprocessors/__init__.py          |   5 +-
 modelscope/preprocessors/nlp/__init__.py      |  18 +-
 .../token_classification_thai_preprocessor.py |  44 ++
 .../token_classification_viet_preprocessor.py |  33 ++
 requirements/nlp.txt                          |   2 +
 ...t_multilingual_named_entity_recognition.py | 102 +++
 .../test_multilingual_word_segmentation.py    |  57 ++
 14 files changed, 1124 insertions(+), 32 deletions(-)
 create mode 100644 modelscope/models/nlp/task_models/nncrf_for_word_segmentation.py
 create mode 100644 modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
 create mode 100644 modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py
 create mode 100644 modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py
 create mode 100644 tests/pipelines/test_multilingual_named_entity_recognition.py
 create mode 100644 tests/pipelines/test_multilingual_word_segmentation.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index c5067c39..7944d1ed 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -70,8 +70,10 @@ class Models(object):
     space_T_en = 'space-T-en'
     space_T_cn = 'space-T-cn'
     tcrf = 'transformer-crf'
+    tcrf_wseg = 'transformer-crf-for-word-segmentation'
     transformer_softmax = 'transformer-softmax'
     lcrf = 'lstm-crf'
+    lcrf_wseg = 'lstm-crf-for-word-segmentation'
     gcnncrf = 'gcnn-crf'
     bart = 'bart'
     gpt3 = 'gpt3'
@@ -219,8 +221,12 @@ class Pipelines(object):
     domain_classification = 'domain-classification'
     sentence_similarity = 'sentence-similarity'
     word_segmentation =
'word-segmentation' + multilingual_word_segmentation = 'multilingual-word-segmentation' + word_segmentation_thai = 'word-segmentation-thai' part_of_speech = 'part-of-speech' named_entity_recognition = 'named-entity-recognition' + named_entity_recognition_thai = 'named-entity-recognition-thai' + named_entity_recognition_viet = 'named-entity-recognition-viet' text_generation = 'text-generation' text2text_generation = 'text2text-generation' sentiment_analysis = 'sentiment-analysis' @@ -343,6 +349,8 @@ class Preprocessors(object): text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' + thai_ner_tokenizer = 'thai-ner-tokenizer' + viet_ner_tokenizer = 'viet-ner-tokenizer' nli_tokenizer = 'nli-tokenizer' sen_cls_tokenizer = 'sen-cls-tokenizer' dialog_intent_preprocessor = 'dialog-intent-preprocessor' @@ -355,6 +363,7 @@ class Preprocessors(object): text_ranking = 'text-ranking' sequence_labeling_tokenizer = 'sequence-labeling-tokenizer' word_segment_text_to_label_preprocessor = 'word-segment-text-to-label-preprocessor' + thai_wseg_tokenizer = 'thai-wseg-tokenizer' fill_mask = 'fill-mask' fill_mask_ponet = 'fill-mask-ponet' faq_question_answering_preprocessor = 'faq-question-answering-preprocessor' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 5ae93caa..d4562f10 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -5,14 +5,26 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .bart import BartForTextErrorCorrection + from .bert import ( + BertForMaskedLM, + BertForTextRanking, + BertForSentenceEmbedding, + BertForSequenceClassification, + BertForTokenClassification, + BertForDocumentSegmentation, + BertModel, + BertConfig, + ) from .csanmt import CsanmtForTranslation - from .heads import SequenceClassificationHead + from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model + from .gpt_neo import GPTNeoModel from .gpt3 import GPT3ForTextGeneration + from .heads import SequenceClassificationHead from .palm_v2 import PalmForTextGeneration - from .space_T_en import StarForTextToSql - from .space_T_cn import TableQuestionAnswering - from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig + from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST + from .space_T_cn import TableQuestionAnswering + from .space_T_en import StarForTextToSql from .structbert import ( SbertForFaqQuestionAnswering, SbertForMaskedLM, @@ -22,19 +34,7 @@ if TYPE_CHECKING: SbertModel, SbertTokenizerFast, ) - from .bert import ( - BertForMaskedLM, - BertForTextRanking, - BertForSentenceEmbedding, - BertForSequenceClassification, - BertForTokenClassification, - BertForDocumentSegmentation, - BertModel, - BertConfig, - ) - from .veco import VecoModel, VecoConfig, VecoForTokenClassification, \ - VecoForSequenceClassification, VecoForMaskedLM, VecoTokenizer, VecoTokenizerFast - from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model + from .T5 import T5ForConditionalGeneration from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -45,9 +45,11 @@ if TYPE_CHECKING: TokenClassificationModel, TransformerCRFForNamedEntityRecognition, ) + from .veco import (VecoConfig, VecoForMaskedLM, + VecoForSequenceClassification, + VecoForTokenClassification, VecoModel, VecoTokenizer, + VecoTokenizerFast) - from .T5 
import T5ForConditionalGeneration - from .gpt_neo import GPTNeoModel else: _import_structure = { 'backbones': ['SbertModel'], @@ -65,9 +67,13 @@ else: 'SbertModel', ], 'veco': [ - 'VecoModel', 'VecoConfig', 'VecoForTokenClassification', - 'VecoForSequenceClassification', 'VecoForMaskedLM', - 'VecoTokenizer', 'VecoTokenizerFast' + 'VecoConfig', + 'VecoForMaskedLM', + 'VecoForSequenceClassification', + 'VecoForTokenClassification', + 'VecoModel', + 'VecoTokenizer', + 'VecoTokenizerFast', ], 'bert': [ 'BertForMaskedLM', @@ -90,11 +96,13 @@ else: 'FeatureExtractionModel', 'InformationExtractionModel', 'LSTMCRFForNamedEntityRecognition', + 'LSTMCRFForWordSegmentation', 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', 'TaskModelForTextGeneration', 'TokenClassificationModel', 'TransformerCRFForNamedEntityRecognition', + 'TransformerCRFForWordSegmentation', ], 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index e733efe2..b8722a36 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -8,8 +8,13 @@ if TYPE_CHECKING: from .feature_extraction import FeatureExtractionModel from .fill_mask import FillMaskModel from .nncrf_for_named_entity_recognition import ( + LSTMCRFForNamedEntityRecognition, TransformerCRFForNamedEntityRecognition, - LSTMCRFForNamedEntityRecognition) + ) + from .nncrf_for_word_segmentation import ( + LSTMCRFForWordSegmentation, + TransformerCRFForWordSegmentation, + ) from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel @@ -24,6 +29,8 @@ else: 'TransformerCRFForNamedEntityRecognition', 'LSTMCRFForNamedEntityRecognition' ], + 'nncrf_for_word_segmentation': + ['TransformerCRFForWordSegmentation', 'LSTMCRFForWordSegmentation'], 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], diff --git a/modelscope/models/nlp/task_models/nncrf_for_word_segmentation.py b/modelscope/models/nlp/task_models/nncrf_for_word_segmentation.py new file mode 100644 index 00000000..2a3f6cf4 --- /dev/null +++ b/modelscope/models/nlp/task_models/nncrf_for_word_segmentation.py @@ -0,0 +1,639 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. +# The CRF implementation borrows mostly from AllenNLP CRF module (https://github.com/allenai/allennlp) +# and pytorch-crf (https://github.com/kmkurn/pytorch-crf) with some modifications. 
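
Before the implementation itself, here is a minimal sketch of how the `CRF` layer defined below is typically driven: training minimises the negative log likelihood returned by `forward`, and inference calls `decode`. The random emission tensor and the shapes are purely illustrative and follow the `batch_first=True` convention used in this file.

    import torch

    # assumes the CRF class defined later in this module is in scope
    num_tags, batch_size, seq_len = 4, 2, 6
    crf = CRF(num_tags, batch_first=True)

    emissions = torch.randn(batch_size, seq_len, num_tags)     # encoder + linear output
    tags = torch.randint(num_tags, (batch_size, seq_len))      # gold label ids
    mask = torch.ones(batch_size, seq_len, dtype=torch.uint8)  # 1 for real tokens

    log_likelihood = crf(emissions, tags, mask=mask)  # scalar, reduction='mean'
    loss = -log_likelihood                            # what a training loop would minimise
    best_paths = crf.decode(emissions, mask=mask)     # shape (1, batch_size, seq_len) tag ids

In `TransformerCRF` and `LSTMCRF` below, the emissions come from the encoder followed by a linear projection, and `label_mask` is used to compact sub-token positions before decoding.
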
+ +import os +from typing import Any, Dict, List, Optional + +import torch +import torch.nn as nn +from transformers import AutoConfig, AutoModel + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import TokenClassifierWithPredictionsOutput +from modelscope.utils.constant import ModelFile, Tasks + +__all__ = ['TransformerCRFForWordSegmentation', 'LSTMCRFForWordSegmentation'] + + +class SequenceLabelingForWordSegmentation(TorchModel): + + def __init__(self, model_dir, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.model = self.init_model(model_dir, *args, **kwargs) + + model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + self.model.load_state_dict( + torch.load(model_ckpt, map_location=torch.device('cpu'))) + + def init_model(self, model_dir, *args, **kwargs): + raise NotImplementedError + + def train(self): + return self.model.train() + + def eval(self): + return self.model.eval() + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + input_tensor = { + 'input_ids': input['input_ids'], + 'attention_mask': input['attention_mask'], + 'label_mask': input['label_mask'], + } + output = { + 'offset_mapping': input['offset_mapping'], + **input_tensor, + **self.model(input_tensor) + } + return output + + def postprocess(self, input: Dict[str, Any], **kwargs): + predicts = self.model.decode(input) + offset_len = len(input['offset_mapping']) + predictions = torch.narrow( + predicts, 1, 0, + offset_len) # index_select only move loc, not resize + return TokenClassifierWithPredictionsOutput( + loss=None, + logits=None, + hidden_states=None, + attentions=None, + offset_mapping=input['offset_mapping'], + predictions=predictions, + ) + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.tcrf_wseg) +class TransformerCRFForWordSegmentation(SequenceLabelingForWordSegmentation): + """This model wraps the TransformerCRF model to register into model sets. + """ + + def init_model(self, model_dir, *args, **kwargs): + self.config = AutoConfig.from_pretrained(model_dir) + num_labels = self.config.num_labels + + model = TransformerCRF(model_dir, num_labels) + return model + + +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.lcrf_wseg) +class LSTMCRFForWordSegmentation(SequenceLabelingForWordSegmentation): + """This model wraps the LSTMCRF model to register into model sets. + """ + + def init_model(self, model_dir, *args, **kwargs): + self.config = AutoConfig.from_pretrained(model_dir) + vocab_size = self.config.vocab_size + embed_width = self.config.embed_width + num_labels = self.config.num_labels + lstm_hidden_size = self.config.lstm_hidden_size + + model = LSTMCRF(vocab_size, embed_width, num_labels, lstm_hidden_size) + return model + + +class TransformerCRF(nn.Module): + """A transformer based model to NER tasks. + + This model will use transformers' backbones as its backbone. 
+ """ + + def __init__(self, model_dir, num_labels, **kwargs): + super(TransformerCRF, self).__init__() + + self.encoder = AutoModel.from_pretrained(model_dir) + self.linear = nn.Linear(self.encoder.config.hidden_size, num_labels) + self.crf = CRF(num_labels, batch_first=True) + + def forward(self, inputs): + embed = self.encoder( + inputs['input_ids'], attention_mask=inputs['attention_mask'])[0] + logits = self.linear(embed) + + if 'label_mask' in inputs: + mask = inputs['label_mask'] + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + + outputs = {'logits': logits} + return outputs + + def decode(self, inputs): + seq_lens = inputs['label_mask'].sum(-1).long() + mask = torch.arange( + inputs['label_mask'].shape[1], + device=seq_lens.device)[None, :] < seq_lens[:, None] + predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) + + return predicts + + +class LSTMCRF(nn.Module): + """ + A standard bilstm-crf model for fast prediction. + """ + + def __init__(self, + vocab_size, + embed_width, + num_labels, + lstm_hidden_size=100, + **kwargs): + super(LSTMCRF, self).__init__() + self.embedding = Embedding(vocab_size, embed_width) + self.lstm = nn.LSTM( + embed_width, + lstm_hidden_size, + num_layers=1, + bidirectional=True, + batch_first=True) + self.ffn = nn.Linear(lstm_hidden_size * 2, num_labels) + self.crf = CRF(num_labels, batch_first=True) + + def forward(self, inputs): + embedding = self.embedding(inputs['input_ids']) + lstm_output, _ = self.lstm(embedding) + logits = self.ffn(lstm_output) + + if 'label_mask' in inputs: + mask = inputs['label_mask'] + masked_lengths = mask.sum(-1).long() + masked_logits = torch.zeros_like(logits) + for i in range(len(mask)): + masked_logits[ + i, :masked_lengths[i], :] = logits[i].masked_select( + mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) + logits = masked_logits + + outputs = {'logits': logits} + return outputs + + def decode(self, inputs): + seq_lens = inputs['label_mask'].sum(-1).long() + mask = torch.arange( + inputs['label_mask'].shape[1], + device=seq_lens.device)[None, :] < seq_lens[:, None] + predicts = self.crf.decode(inputs['logits'], mask=mask).squeeze(0) + outputs = {'predicts': predicts} + return outputs + + +class CRF(nn.Module): + """Conditional random field. + This module implements a conditional random field [LMP01]_. The forward computation + of this class computes the log likelihood of the given sequence of tags and + emission score tensor. This class also has `~CRF.decode` method which finds + the best tag sequence given an emission score tensor using `Viterbi algorithm`_. + Args: + num_tags: Number of tags. + batch_first: Whether the first dimension corresponds to the size of a minibatch. + Attributes: + start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size + ``(num_tags,)``. + end_transitions (`~torch.nn.Parameter`): End transition score tensor of size + ``(num_tags,)``. + transitions (`~torch.nn.Parameter`): Transition score tensor of size + ``(num_tags, num_tags)``. + .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). + "Conditional random fields: Probabilistic models for segmenting and + labeling sequence data". *Proc. 18th International Conf. on Machine + Learning*. Morgan Kaufmann. pp. 282–289. + .. 
_Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm + + """ + + def __init__(self, num_tags: int, batch_first: bool = False) -> None: + if num_tags <= 0: + raise ValueError(f'invalid number of tags: {num_tags}') + super().__init__() + self.num_tags = num_tags + self.batch_first = batch_first + self.start_transitions = nn.Parameter(torch.empty(num_tags)) + self.end_transitions = nn.Parameter(torch.empty(num_tags)) + self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Initialize the transition parameters. + The parameters will be initialized randomly from a uniform distribution + between -0.1 and 0.1. + """ + nn.init.uniform_(self.start_transitions, -0.1, 0.1) + nn.init.uniform_(self.end_transitions, -0.1, 0.1) + nn.init.uniform_(self.transitions, -0.1, 0.1) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(num_tags={self.num_tags})' + + def forward(self, + emissions: torch.Tensor, + tags: torch.LongTensor, + mask: Optional[torch.ByteTensor] = None, + reduction: str = 'mean') -> torch.Tensor: + """Compute the conditional log likelihood of a sequence of tags given emission scores. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + tags (`~torch.LongTensor`): Sequence of tags tensor of size + ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + reduction: Specifies the reduction to apply to the output: + ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. + ``sum``: the output will be summed over batches. ``mean``: the output will be + averaged over batches. ``token_mean``: the output will be averaged over tokens. + Returns: + `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if + reduction is ``none``, ``()`` otherwise. + """ + if reduction not in ('none', 'sum', 'mean', 'token_mean'): + raise ValueError(f'invalid reduction: {reduction}') + if mask is None: + mask = torch.ones_like(tags, dtype=torch.uint8, device=tags.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, tags=tags, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + mask = mask.transpose(0, 1) + + # shape: (batch_size,) + numerator = self._compute_score(emissions, tags, mask) + # shape: (batch_size,) + denominator = self._compute_normalizer(emissions, mask) + # shape: (batch_size,) + llh = numerator - denominator + + if reduction == 'none': + return llh + if reduction == 'sum': + return llh.sum() + if reduction == 'mean': + return llh.mean() + return llh.sum() / mask.float().sum() + + def decode(self, + emissions: torch.Tensor, + mask: Optional[torch.ByteTensor] = None, + nbest: Optional[int] = None, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + """Find the most likely tag sequence using Viterbi algorithm. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. 
+ mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + nbest (`int`): Number of most probable paths for each sequence + pad_tag (`int`): Tag at padded positions. Often input varies in length and + the length will be padded to the maximum length in the batch. Tags at + the padded positions will be assigned with a padding tag, i.e. `pad_tag` + Returns: + A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + """ + if nbest is None: + nbest = 1 + if mask is None: + mask = torch.ones( + emissions.shape[:2], + dtype=torch.uint8, + device=emissions.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + mask = mask.transpose(0, 1) + + if nbest == 1: + return self._viterbi_decode(emissions, mask, pad_tag).unsqueeze(0) + return self._viterbi_decode_nbest(emissions, mask, nbest, pad_tag) + + def _validate(self, + emissions: torch.Tensor, + tags: Optional[torch.LongTensor] = None, + mask: Optional[torch.ByteTensor] = None) -> None: + if emissions.dim() != 3: + raise ValueError( + f'emissions must have dimension of 3, got {emissions.dim()}') + if emissions.size(2) != self.num_tags: + raise ValueError( + f'expected last dimension of emissions is {self.num_tags}, ' + f'got {emissions.size(2)}') + + if tags is not None: + if emissions.shape[:2] != tags.shape: + raise ValueError( + 'the first two dimensions of emissions and tags must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}' + ) + + if mask is not None: + if emissions.shape[:2] != mask.shape: + raise ValueError( + 'the first two dimensions of emissions and mask must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}' + ) + no_empty_seq = not self.batch_first and mask[0].all() + no_empty_seq_bf = self.batch_first and mask[:, 0].all() + if not no_empty_seq and not no_empty_seq_bf: + raise ValueError('mask of the first timestep must all be on') + + def _compute_score(self, emissions: torch.Tensor, tags: torch.LongTensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # tags: (seq_length, batch_size) + # mask: (seq_length, batch_size) + seq_length, batch_size = tags.shape + mask = mask.float() + + # Start transition score and first emission + # shape: (batch_size,) + score = self.start_transitions[tags[0]] + score += emissions[0, torch.arange(batch_size), tags[0]] + + for i in range(1, seq_length): + # Transition score to next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += self.transitions[tags[i - 1], tags[i]] * mask[i] + + # Emission score for next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] + + # End transition score + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + # shape: (batch_size,) + last_tags = tags[seq_ends, torch.arange(batch_size)] + # shape: (batch_size,) + score += self.end_transitions[last_tags] + + return score + + def _compute_normalizer(self, emissions: torch.Tensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + seq_length = emissions.size(0) + + # Start transition score and first emission; score has size of + # (batch_size, num_tags) where for 
each batch, the j-th column stores + # the score that the first timestep has tag j + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + + for i in range(1, seq_length): + # Broadcast score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emissions = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the sum of scores of all + # possible tag sequences so far that end with transitioning from tag i to tag j + # and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emissions + + # Sum over all possible current tags, but we're in score space, so a sum + # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of + # all possible tag sequences so far, that end in tag i + # shape: (batch_size, num_tags) + next_score = torch.logsumexp(next_score, dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(1), next_score, score) + + # End transition score + # shape: (batch_size, num_tags) + score += self.end_transitions + + # Sum (log-sum-exp) over all possible tags + # shape: (batch_size,) + return torch.logsumexp(score, dim=1) + + def _viterbi_decode(self, + emissions: torch.FloatTensor, + mask: torch.ByteTensor, + pad_tag: Optional[int] = None) -> List[List[int]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros((seq_length, batch_size, self.num_tags), + dtype=torch.long, + device=device) + oor_idx = torch.zeros((batch_size, self.num_tags), + dtype=torch.long, + device=device) + oor_tag = torch.full((seq_length, batch_size), + pad_tag, + dtype=torch.long, + device=device) + + # - score is a tensor of size (batch_size, num_tags) where for every batch, + # value at column j stores the score of the best tag sequence so far that ends + # with tag j + # - history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. 
out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + # Broadcast viterbi score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emission = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the score of the best + # tag sequence so far that ends with transitioning from tag i to tag j and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + + # Find the maximum score over all possible current tag + # shape: (batch_size, num_tags) + next_score, indices = next_score.max(dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(-1), next_score, score) + indices = torch.where(mask[i].unsqueeze(-1), indices, oor_idx) + history_idx[i - 1] = indices + + # End transition score + # shape: (batch_size, num_tags) + end_score = score + self.end_transitions + _, end_tag = end_score.max(dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_( + 1, + seq_ends.view(-1, 1, 1).expand(-1, 1, self.num_tags), + end_tag.view(-1, 1, 1).expand(-1, 1, self.num_tags)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size), + dtype=torch.long, + device=device) + best_tags = torch.zeros(batch_size, 1, dtype=torch.long, device=device) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx], 1, best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size) + + return torch.where(mask, best_tags_arr, oor_tag).transpose(0, 1) + + def _viterbi_decode_nbest( + self, + emissions: torch.FloatTensor, + mask: torch.ByteTensor, + nbest: int, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (nbest, batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros( + (seq_length, batch_size, self.num_tags, nbest), + dtype=torch.long, + device=device) + oor_idx = torch.zeros((batch_size, self.num_tags, nbest), + dtype=torch.long, + device=device) + oor_tag = torch.full((seq_length, batch_size, nbest), + pad_tag, + dtype=torch.long, + device=device) + + # + score is a tensor of size (batch_size, num_tags) where for every batch, + # value at column j stores the score of the best tag sequence so far that ends + # with tag j + # + history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. 
out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + if i == 1: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1) + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + else: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1).unsqueeze(2) + # shape: (batch_size, num_tags, nbest, num_tags) + next_score = broadcast_score + self.transitions.unsqueeze( + 1) + broadcast_emission + + # Find the top `nbest` maximum score over all possible current tag + # shape: (batch_size, nbest, num_tags) + next_score, indices = next_score.view(batch_size, -1, + self.num_tags).topk( + nbest, dim=1) + + if i == 1: + score = score.unsqueeze(-1).expand(-1, -1, nbest) + indices = indices * nbest + + # convert to shape: (batch_size, num_tags, nbest) + next_score = next_score.transpose(2, 1) + indices = indices.transpose(2, 1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags, nbest) + score = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), + next_score, score) + indices = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), indices, + oor_idx) + history_idx[i - 1] = indices + + # End transition score shape: (batch_size, num_tags, nbest) + end_score = score + self.end_transitions.unsqueeze(-1) + _, end_tag = end_score.view(batch_size, -1).topk(nbest, dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_( + 1, + seq_ends.view(-1, 1, 1, 1).expand(-1, 1, self.num_tags, nbest), + end_tag.view(-1, 1, 1, nbest).expand(-1, 1, self.num_tags, nbest)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size, nbest), + dtype=torch.long, + device=device) + best_tags = torch.arange(nbest, dtype=torch.long, device=device) \ + .view(1, -1).expand(batch_size, -1) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx].view(batch_size, -1), 1, + best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size, -1) // nbest + + return torch.where(mask.unsqueeze(-1), best_tags_arr, + oor_tag).permute(2, 1, 0) + + +class Embedding(nn.Module): + + def __init__(self, vocab_size, embed_width): + super(Embedding, self).__init__() + + self.embedding = nn.Embedding(vocab_size, embed_width) + + def forward(self, input_ids): + return self.embedding(input_ids) diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 73bd0d8c..7b726308 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -16,7 +16,9 @@ if TYPE_CHECKING: from .feature_extraction_pipeline import FeatureExtractionPipeline from .fill_mask_pipeline import FillMaskPipeline from .information_extraction_pipeline import InformationExtractionPipeline - from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline + from .named_entity_recognition_pipeline import NamedEntityRecognitionPipeline, \ + NamedEntityRecognitionThaiPipeline, \ + NamedEntityRecognitionVietPipeline from .text_ranking_pipeline import TextRankingPipeline from 
.sentence_embedding_pipeline import SentenceEmbeddingPipeline from .text_classification_pipeline import TextClassificationPipeline @@ -29,6 +31,8 @@ if TYPE_CHECKING: from .translation_pipeline import TranslationPipeline from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline + from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ + WordSegmentationThaiPipeline else: _import_structure = { @@ -46,8 +50,11 @@ else: 'feature_extraction_pipeline': ['FeatureExtractionPipeline'], 'fill_mask_pipeline': ['FillMaskPipeline'], 'information_extraction_pipeline': ['InformationExtractionPipeline'], - 'named_entity_recognition_pipeline': - ['NamedEntityRecognitionPipeline'], + 'named_entity_recognition_pipeline': [ + 'NamedEntityRecognitionPipeline', + 'NamedEntityRecognitionThaiPipeline', + 'NamedEntityRecognitionVietPipeline' + ], 'text_ranking_pipeline': ['TextRankingPipeline'], 'sentence_embedding_pipeline': ['SentenceEmbeddingPipeline'], 'summarization_pipeline': ['SummarizationPipeline'], @@ -64,6 +71,10 @@ else: 'word_segmentation_pipeline': ['WordSegmentationPipeline'], 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], + 'multilingual_word_segmentation_pipeline': [ + 'MultilingualWordSegmentationPipeline', + 'WordSegmentationThaiPipeline' + ], } import sys diff --git a/modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py b/modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py new file mode 100644 index 00000000..56c3a041 --- /dev/null +++ b/modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py @@ -0,0 +1,125 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import (Preprocessor, + TokenClassificationPreprocessor, + WordSegmentationPreprocessorThai) +from modelscope.utils.constant import Tasks + +__all__ = [ + 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' +] + + +@PIPELINES.register_module( + Tasks.word_segmentation, + module_name=Pipelines.multilingual_word_segmentation) +class MultilingualWordSegmentationPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """Use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction + + Args: + model (str or Model): Supply either a local model dir which supported word segmentation task, or a + model id from the model hub, or a torch model instance. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. + sequence_length: Max sequence length in the user's custom scenario. 512 will be used as a default value. + + To view other examples plese check the tests/pipelines/test_multilingual_word_segmentation.py. 
+ """ + + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = TokenClassificationPreprocessor( + model.model_dir, + sequence_length=kwargs.pop('sequence_length', 512)) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.tokenizer = preprocessor.tokenizer + self.config = model.config + assert len(self.config.id2label) > 0 + self.id2label = self.config.id2label + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + text = inputs.pop(OutputKeys.TEXT) + with torch.no_grad(): + return { + **super().forward(inputs, **forward_params), OutputKeys.TEXT: + text + } + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + text = inputs['text'] + offset_mapping = [x.cpu().tolist() for x in inputs['offset_mapping']] + labels = [ + self.id2label[x] + for x in inputs['predictions'].squeeze(0).cpu().numpy() + ] + entities = [] + entity = {} + for label, offsets in zip(labels, offset_mapping): + if label[0] in 'BS': + if entity: + entity['span'] = text[entity['start']:entity['end']] + entities.append(entity) + entity = { + 'type': label[2:], + 'start': offsets[0], + 'end': offsets[1] + } + if label[0] in 'IES': + if entity: + entity['end'] = offsets[1] + if label[0] in 'ES': + if entity: + entity['span'] = text[entity['start']:entity['end']] + entities.append(entity) + entity = {} + if entity: + entity['span'] = text[entity['start']:entity['end']] + entities.append(entity) + + word_segments = [entity['span'] for entity in entities] + outputs = {OutputKeys.OUTPUT: word_segments, OutputKeys.LABELS: []} + + return outputs + + +@PIPELINES.register_module( + Tasks.word_segmentation, module_name=Pipelines.word_segmentation_thai) +class WordSegmentationThaiPipeline(MultilingualWordSegmentationPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = WordSegmentationPreprocessorThai( + model.model_dir, + sequence_length=kwargs.pop('sequence_length', 512)) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + outputs = super().postprocess(inputs, **postprocess_params) + word_segments = outputs[OutputKeys.OUTPUT] + word_segments = [seg.replace(' ', '') for seg in word_segments] + + return {OutputKeys.OUTPUT: word_segments, OutputKeys.LABELS: []} diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 8d8c4542..fdcf9e0f 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ -9,13 +9,17 @@ from modelscope.models import Model from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (Preprocessor, +from modelscope.preprocessors import (NERPreprocessorThai, NERPreprocessorViet, + Preprocessor, TokenClassificationPreprocessor) from modelscope.utils.constant import Tasks from modelscope.utils.tensor_utils import (torch_nested_detach, torch_nested_numpify) -__all__ = ['NamedEntityRecognitionPipeline'] +__all__ = [ + 'NamedEntityRecognitionPipeline', 
'NamedEntityRecognitionThaiPipeline', + 'NamedEntityRecognitionVietPipeline' +] @PIPELINES.register_module( @@ -126,3 +130,39 @@ class NamedEntityRecognitionPipeline(Pipeline): else: outputs = {OutputKeys.OUTPUT: chunks} return outputs + + +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_thai) +class NamedEntityRecognitionThaiPipeline(NamedEntityRecognitionPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = NERPreprocessorThai( + model.model_dir, + sequence_length=kwargs.pop('sequence_length', 512)) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + +@PIPELINES.register_module( + Tasks.named_entity_recognition, + module_name=Pipelines.named_entity_recognition_viet) +class NamedEntityRecognitionVietPipeline(NamedEntityRecognitionPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + model = model if isinstance(model, + Model) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = NERPreprocessorViet( + model.model_dir, + sequence_length=kwargs.pop('sequence_length', 512)) + super().__init__(model=model, preprocessor=preprocessor, **kwargs) diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 76c6d877..e568098f 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -28,7 +28,8 @@ if TYPE_CHECKING: SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, - TableQuestionAnsweringPreprocessor) + TableQuestionAnsweringPreprocessor, NERPreprocessorViet, + NERPreprocessorThai, WordSegmentationPreprocessorThai) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor else: @@ -58,6 +59,8 @@ else: 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', + 'NERPreprocessorViet', 'NERPreprocessorThai', + 'WordSegmentationPreprocessorThai', 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', 'DialogStateTrackingPreprocessor', 'ConversationalTextToSqlPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index ea7b6bf4..d9c55fe1 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -20,6 +20,8 @@ if TYPE_CHECKING: from .text2text_generation_preprocessor import Text2TextGenerationPreprocessor from .token_classification_preprocessor import TokenClassificationPreprocessor, \ WordSegmentationBlankSetToLabelPreprocessor + from .token_classification_thai_preprocessor import WordSegmentationPreprocessorThai, NERPreprocessorThai + from .token_classification_viet_preprocessor import NERPreprocessorViet from .zero_shot_classification_reprocessor import ZeroShotClassificationPreprocessor from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -60,10 +62,20 @@ else: 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', ], + 'token_classification_thai_preprocessor': [ + 'NERPreprocessorThai', + 'WordSegmentationPreprocessorThai', + ], + 'token_classification_viet_preprocessor': [ + 'NERPreprocessorViet', + 
], 'space': [ - 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', - 'DialogStateTrackingPreprocessor', 'InputFeatures', - 'MultiWOZBPETextField', 'IntentBPETextField' + 'DialogIntentPredictionPreprocessor', + 'DialogModelingPreprocessor', + 'DialogStateTrackingPreprocessor', + 'InputFeatures', + 'MultiWOZBPETextField', + 'IntentBPETextField', ], 'space_T_en': ['ConversationalTextToSqlPreprocessor'], 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], diff --git a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py new file mode 100644 index 00000000..a356cea7 --- /dev/null +++ b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Tuple, Union + +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .token_classification_preprocessor import TokenClassificationPreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.thai_ner_tokenizer) +class NERPreprocessorThai(TokenClassificationPreprocessor): + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + from pythainlp import word_tokenize + + segmented_data = ' '.join([ + w.strip(' ') for w in word_tokenize(text=data, engine='newmm') + if w.strip(' ') != '' + ]) + output = super().__call__(segmented_data) + + return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.thai_wseg_tokenizer) +class WordSegmentationPreprocessorThai(TokenClassificationPreprocessor): + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + import regex + data = regex.findall(r'\X', data) + data = ' '.join([char for char in data]) + + output = super().__call__(data) + + return output diff --git a/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py new file mode 100644 index 00000000..f8970d1a --- /dev/null +++ b/modelscope/preprocessors/nlp/token_classification_viet_preprocessor.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
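For orientation, a minimal standalone sketch of what these language-specific preprocessors feed into the shared TokenClassificationPreprocessor: the Thai classes above pre-segment with pythainlp's newmm tokenizer or split the input into grapheme clusters, and the Vietnamese class defined next undoes pyvi's underscore joining. This is an illustration, not part of the patch; the Thai sample string is a placeholder, while the Vietnamese one is reused from the tests later in this patch.

from pythainlp import word_tokenize
import regex
from pyvi import ViTokenizer

thai_text = 'สวัสดี'  # placeholder sample

# NERPreprocessorThai: pre-segment with the newmm word tokenizer, drop empty
# pieces, and pass the space-joined result on to the base preprocessor.
ner_input = ' '.join(
    w.strip(' ') for w in word_tokenize(text=thai_text, engine='newmm')
    if w.strip(' ') != '')

# WordSegmentationPreprocessorThai: split into Unicode grapheme clusters (\X)
# so every visible character cell becomes one token for the model to label.
wseg_input = ' '.join(regex.findall(r'\X', thai_text))

# NERPreprocessorViet: tokenize with pyvi, then undo its underscore joining so
# the model sees plain space-separated syllables.
viet_text = 'Nón vành dễ thương cho bé gái'  # reused from the tests in this patch
raw_words = []
for w in ViTokenizer.tokenize(viet_text).split(' '):
    if w.strip(' ') != '':
        raw_words.extend(w.split('_'))
viet_input = ' '.join(raw_words)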
+ +from typing import Any, Dict, Tuple, Union + +import torch + +from modelscope.metainfo import Preprocessors +from modelscope.outputs import OutputKeys +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert +from .token_classification_preprocessor import TokenClassificationPreprocessor + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.viet_ner_tokenizer) +class NERPreprocessorViet(TokenClassificationPreprocessor): + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + from pyvi import ViTokenizer + + seg_words = [ + t.strip(' ') for t in ViTokenizer.tokenize(data).split(' ') + if t.strip(' ') != '' + ] + raw_words = [] + for w in seg_words: + raw_words.extend(w.split('_')) + segmented_data = ' '.join(raw_words) + output = super().__call__(segmented_data) + + return output diff --git a/requirements/nlp.txt b/requirements/nlp.txt index a5f3cbd9..9a4abd71 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -4,6 +4,8 @@ megatron_util pai-easynlp # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 +pythainlp +pyvi # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 diff --git a/tests/pipelines/test_multilingual_named_entity_recognition.py b/tests/pipelines/test_multilingual_named_entity_recognition.py new file mode 100644 index 00000000..6f72c83c --- /dev/null +++ b/tests/pipelines/test_multilingual_named_entity_recognition.py @@ -0,0 +1,102 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition, + TransformerCRFForNamedEntityRecognition) +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import (NamedEntityRecognitionThaiPipeline, + NamedEntityRecognitionVietPipeline) +from modelscope.preprocessors import NERPreprocessorThai, NERPreprocessorViet +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class MultilingualNamedEntityRecognitionTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.named_entity_recognition + self.model_id = 'damo/nlp_xlmr_named-entity-recognition_thai-ecommerce-title' + + thai_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_thai-ecommerce-title' + thai_sentence = 'เครื่องชั่งดิจิตอลแบบตั้งพื้น150kg.' 
+ + viet_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_viet-ecommerce-title' + viet_sentence = 'Nón vành dễ thương cho bé gái' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_tcrf_by_direct_model_download_thai(self): + cache_path = snapshot_download(self.thai_tcrf_model_id) + tokenizer = NERPreprocessorThai(cache_path) + model = TransformerCRFForNamedEntityRecognition( + cache_path, tokenizer=tokenizer) + pipeline1 = NamedEntityRecognitionThaiPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(f'thai_sentence: {self.thai_sentence}\n' + f'pipeline1:{pipeline1(input=self.thai_sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.thai_sentence)}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_tcrf_with_model_from_modelhub_thai(self): + model = Model.from_pretrained(self.thai_tcrf_model_id) + tokenizer = NERPreprocessorThai(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.thai_sentence)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_thai(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.thai_tcrf_model_id) + print(pipeline_ins(input=self.thai_sentence)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_tcrf_by_direct_model_download_viet(self): + cache_path = snapshot_download(self.viet_tcrf_model_id) + tokenizer = NERPreprocessorViet(cache_path) + model = TransformerCRFForNamedEntityRecognition( + cache_path, tokenizer=tokenizer) + pipeline1 = NamedEntityRecognitionVietPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(f'viet_sentence: {self.viet_sentence}\n' + f'pipeline1:{pipeline1(input=self.viet_sentence)}') + print() + print(f'pipeline2: {pipeline2(input=self.viet_sentence)}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_tcrf_with_model_from_modelhub_viet(self): + model = Model.from_pretrained(self.viet_tcrf_model_id) + tokenizer = NERPreprocessorViet(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.viet_sentence)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_tcrf_with_model_name_viet(self): + pipeline_ins = pipeline( + task=Tasks.named_entity_recognition, model=self.viet_tcrf_model_id) + print(pipeline_ins(input=self.viet_sentence)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_multilingual_word_segmentation.py b/tests/pipelines/test_multilingual_word_segmentation.py new file mode 100644 index 00000000..25b4b241 --- /dev/null +++ b/tests/pipelines/test_multilingual_word_segmentation.py @@ -0,0 +1,57 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
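Before the end-to-end tests below, a compact standalone illustration (not part of the patch) of how MultilingualWordSegmentationPipeline.postprocess, added earlier in this patch, rebuilds surface words from BIES labels and character offsets. The text, labels and offsets here are made up.

# Labels follow the B/I/E/S scheme; offsets are (start, end) character spans.
text = 'abcdef'
labels = ['B-WORD', 'E-WORD', 'S-WORD', 'B-WORD', 'I-WORD', 'E-WORD']
offsets = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]

words, entity = [], {}
for label, (start, end) in zip(labels, offsets):
    if label[0] in 'BS':              # a new span starts at this token
        if entity:
            words.append(text[entity['start']:entity['end']])
        entity = {'start': start, 'end': end}
    if label[0] in 'IES' and entity:  # the current span extends to this token
        entity['end'] = end
    if label[0] in 'ES' and entity:   # the span is complete; emit it
        words.append(text[entity['start']:entity['end']])
        entity = {}
if entity:                            # flush a span left open at the end
    words.append(text[entity['start']:entity['end']])

print(words)                          # ['ab', 'c', 'def']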
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import TransformerCRFForWordSegmentation +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import WordSegmentationThaiPipeline +from modelscope.preprocessors import WordSegmentationPreprocessorThai +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.regress_test_utils import MsRegressTool +from modelscope.utils.test_utils import test_level + + +class WordSegmentationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.word_segmentation + self.model_id = 'damo/nlp_xlmr_word-segmentation_thai' + + sentence = 'รถคันเก่าก็ยังเก็บเอาไว้ยังไม่ได้ขาย' + regress_tool = MsRegressTool(baseline=False) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = WordSegmentationPreprocessorThai(cache_path) + model = TransformerCRFForWordSegmentation.from_pretrained(cache_path) + pipeline1 = WordSegmentationThaiPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.word_segmentation, model=model, preprocessor=tokenizer) + print(f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence)}') + print(f'pipeline2: {pipeline2(input=self.sentence)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = WordSegmentationPreprocessorThai(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.word_segmentation, model=self.model_id) + print(pipeline_ins(input=self.sentence)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 3b8fb92c136f9c139e7dfad2de0d164b413290a4 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Wed, 26 Oct 2022 16:04:14 +0800 Subject: [PATCH 127/129] [to #42322933] debug header ids and header names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复header_ids和header_names命名反了的问题 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10516557 --- .../nlp/table_question_answering_pipeline.py | 66 +++++++++++++++---- .../nlp/space_T_cn/fields/struct.py | 22 +++++++ .../test_table_question_answering.py | 8 ++- 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 826e35a9..b75a8153 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -69,6 +69,7 @@ class TableQuestionAnsweringPipeline(Pipeline): self.max_where_num = constant.max_where_num self.col_type_dict = constant.col_type_dict self.schema_link_dict = constant.schema_link_dict + self.limit_dict = constant.limit_dict super().__init__(model=model, preprocessor=preprocessor, 
**kwargs) @@ -244,7 +245,13 @@ class TableQuestionAnsweringPipeline(Pipeline): + header_id + ')') str_cond_list, sql_cond_list = [], [] + where_conds, orderby_conds = [], [] for cond in sql['conds']: + if cond[1] in [4, 5]: + orderby_conds.append(cond) + else: + where_conds.append(cond) + for cond in where_conds: header_name = header_names[cond[0]] if header_name == '空列': continue @@ -255,14 +262,49 @@ class TableQuestionAnsweringPipeline(Pipeline): + '" )') sql_cond_list.append('( ' + header_id + ' ' + op + ' "' + value + '" )') - - cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' - - if len(str_cond_list) != 0: - final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join( - str_sel_list), table['table_name'], cond.join(str_cond_list)) - final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join( - sql_sel_list), table['table_id'], cond.join(sql_cond_list)) + cond_str = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' + str_where_conds = cond_str.join(str_cond_list) + sql_where_conds = cond_str.join(sql_cond_list) + if len(orderby_conds) != 0: + str_orderby_column = ', '.join( + [header_names[cond[0]] for cond in orderby_conds]) + sql_orderby_column = ', '.join([ + '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]]) + for cond in orderby_conds + ]) + str_orderby_op = self.cond_ops[orderby_conds[0][1]] + str_orderby = '%s %s' % (str_orderby_column, str_orderby_op) + sql_orderby = '%s %s' % (sql_orderby_column, str_orderby_op) + limit_key = orderby_conds[0][2] + is_in, limit_num = False, -1 + for key in self.limit_dict: + if key in limit_key: + is_in = True + limit_num = self.limit_dict[key] + break + if is_in: + str_orderby += ' LIMIT %d' % (limit_num) + sql_orderby += ' LIMIT %d' % (limit_num) + else: + str_orderby = '' + + if len(str_cond_list) != 0 and len(str_orderby) != 0: + final_str = 'SELECT %s FROM %s WHERE %s ORDER BY %s' % ( + ', '.join(str_sel_list), table['table_name'], str_where_conds, + str_orderby) + final_sql = 'SELECT %s FROM `%s` WHERE %s ORDER BY %s' % ( + ', '.join(sql_sel_list), table['table_id'], sql_where_conds, + sql_orderby) + elif len(str_cond_list) != 0: + final_str = 'SELECT %s FROM %s WHERE %s' % ( + ', '.join(str_sel_list), table['table_name'], str_where_conds) + final_sql = 'SELECT %s FROM `%s` WHERE %s' % ( + ', '.join(sql_sel_list), table['table_id'], sql_where_conds) + elif len(str_orderby) != 0: + final_str = 'SELECT %s FROM %s ORDER BY %s' % ( + ', '.join(str_sel_list), table['table_name'], str_orderby) + final_sql = 'SELECT %s FROM `%s` ORDER BY %s' % ( + ', '.join(sql_sel_list), table['table_id'], sql_orderby) else: final_str = 'SELECT %s FROM %s' % (', '.join(str_sel_list), table['table_name']) @@ -300,10 +342,10 @@ class TableQuestionAnsweringPipeline(Pipeline): cursor = self.db.connection_obj.cursor().execute(sql.query) header_ids, header_names = [], [] for description in cursor.description: - header_ids.append(self.db.tables[result['table_id']] - ['headerid2name'].get( - description[0], description[0])) - header_names.append(description[0]) + header_names.append(self.db.tables[result['table_id']] + ['headerid2name'].get( + description[0], description[0])) + header_ids.append(description[0]) rows = [] for res in cursor.fetchall(): rows.append(list(res)) diff --git a/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py b/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py index 3c2e664b..917e1aaa 100644 --- a/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py +++ b/modelscope/preprocessors/nlp/space_T_cn/fields/struct.py @@ 
-179,3 +179,25 @@ class Constant: self.max_select_num = 4 self.max_where_num = 6 + + self.limit_dict = { + '最': 1, + '1': 1, + '一': 1, + '2': 2, + '二': 2, + '3': 3, + '三': 3, + '4': 4, + '四': 4, + '5': 5, + '五': 5, + '6': 6, + '六': 6, + '7': 7, + '七': 7, + '8': 8, + '八': 8, + '9': 9, + '九': 9 + } diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index eece7f57..825d8f23 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -128,11 +128,13 @@ class TableQuestionAnswering(unittest.TestCase): def print_func(pl, i): result = pl({ - 'question': '长江流域的小(2)型水库的库容总量是多少?', - 'table_id': 'reservoir', + 'question': '上个月收益从低到高排前七的基金的名称和风险等级是什么', + 'table_id': 'fund', 'history_sql': None }) - print(i, json.dumps(result)) + print(i, result[OutputKeys.OUTPUT][OutputKeys.SQL_QUERY], + result[OutputKeys.OUTPUT][OutputKeys.QUERT_RESULT], + json.dumps(result)) procs = [] for i in range(5): From 2c994ed7600abb1ffd32ab8bd681dd7909a6e0d8 Mon Sep 17 00:00:00 2001 From: "wenshen.xws" Date: Wed, 26 Oct 2022 16:18:27 +0800 Subject: [PATCH 128/129] [to #42322933]fix tokenizer for faq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 多语言faq,Tokenizer新增类型判别 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10530690 --- .../nlp/faq_question_answering_preprocessor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py index 72c8ed99..873a8448 100644 --- a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -18,11 +18,19 @@ class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor): def __init__(self, model_dir: str, *args, **kwargs): super(FaqQuestionAnsweringPreprocessor, self).__init__( model_dir, mode=ModeKeys.INFERENCE, **kwargs) + from transformers import BertTokenizer - self.tokenizer = BertTokenizer.from_pretrained(model_dir) + preprocessor_config = Config.from_file( os.path.join(model_dir, ModelFile.CONFIGURATION)).get( ConfigFields.preprocessor, {}) + if preprocessor_config.get('tokenizer', + 'BertTokenizer') == 'XLMRoberta': + from transformers import XLMRobertaTokenizer + self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir) + else: + self.tokenizer = BertTokenizer.from_pretrained(model_dir) + self.MAX_LEN = preprocessor_config.get('max_seq_length', 50) self.label_dict = None From e4a0e046f9577436d029e15d874070c4b3e9ce58 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Wed, 26 Oct 2022 16:19:20 +0800 Subject: [PATCH 129/129] [to #42322933] Add ut for mplug and bloom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 为新上线的 langboat/bloom-1b4-zh,damo/mplug_visual-question-answering_coco_base_zh,damo/mplug_image-captioning_coco_base_zh 三个模型添加 ut,test_level 设置为 2 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10524221 --- tests/pipelines/test_mplug_tasks.py | 23 +++++++++++++++++++---- tests/pipelines/test_text_generation.py | 18 +++++++++--------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py index 11c9798f..21439ce2 100644 --- a/tests/pipelines/test_mplug_tasks.py +++ b/tests/pipelines/test_mplug_tasks.py @@ -13,10 +13,6 @@ 
from modelscope.utils.test_utils import test_level class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): - def setUp(self) -> None: - self.task = 'visual-question-answering' - self.model_id = 'damo/mplug_visual-question-answering_coco_large_en' - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_captioning_with_model(self): model = Model.from_pretrained( @@ -80,6 +76,25 @@ class MplugTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = pipeline_retrieval(input) print(result) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_image_captioning_zh_base_with_name(self): + pipeline_caption = pipeline( + Tasks.image_captioning, + model='damo/mplug_image-captioning_coco_base_zh') + image = Image.open('data/test/images/image_mplug_vqa.jpg') + result = pipeline_caption(image) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_visual_question_answering_zh_base_with_name(self): + model = 'damo/mplug_visual-question-answering_coco_base_zh' + pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + text = '这个女人在做什么?' + input = {'image': image, 'text': text} + result = pipeline_vqa(input) + print(result) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index f624f021..ffb30090 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -165,14 +165,16 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline(task=Tasks.text_generation) print(pipeline_ins(self.palm_input_zh)) - @unittest.skip('demo compatibility test is only enabled on a needed-basis') - def test_demo_compatibility(self): - self.compatibility_check() + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_bloom(self): + pipe = pipeline( + task=Tasks.text_generation, model='langboat/bloom-1b4-zh') + print(pipe('中国的首都是')) @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") def test_gpt_neo(self): pipe = pipeline( - task=Tasks.text_generation, model='Langboat/mengzi-gpt-neo-base') + task=Tasks.text_generation, model='langboat/mengzi-gpt-neo-base') print( pipe( '我是', @@ -182,11 +184,9 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): max_length=20, repetition_penalty=0.5)) - @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") - def test_bloom(self): - pipe = pipeline( - task=Tasks.text_generation, model='Langboat/bloom-1b4-zh') - print(pipe('中国的首都是')) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() if __name__ == '__main__':