diff --git a/.gitattributes b/.gitattributes index 88ef2f44..60ff0dd2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ *.wav filter=lfs diff=lfs merge=lfs -text *.JPEG filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text +*.avi filter=lfs diff=lfs merge=lfs -text diff --git a/data/test/audios/3ch_nihaomiya.wav b/data/test/audios/3ch_nihaomiya.wav new file mode 100644 index 00000000..57d9f061 --- /dev/null +++ b/data/test/audios/3ch_nihaomiya.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad1a268c614076614a2ae6528abc29cc85ae35826d172079d7d9b26a0299559 +size 4325096 diff --git a/data/test/audios/farend_speech.wav b/data/test/audios/farend_speech.wav new file mode 100644 index 00000000..4e96d842 --- /dev/null +++ b/data/test/audios/farend_speech.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3637ee0628d0953f77d5a32327980af542c43230c4127d2a72b4df1ea2ffb0be +size 320042 diff --git a/data/test/audios/nearend_mic.wav b/data/test/audios/nearend_mic.wav new file mode 100644 index 00000000..e055c2e0 --- /dev/null +++ b/data/test/audios/nearend_mic.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc116af609a66f431f94df6b385ff2aa362f8a2d437c2279f5401e47f9178469 +size 320042 diff --git a/data/test/audios/speech_with_noise.wav b/data/test/audios/speech_with_noise.wav new file mode 100644 index 00000000..d57488c9 --- /dev/null +++ b/data/test/audios/speech_with_noise.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9354345a6297f4522e690d337546aa9a686a7e61eefcd935478a2141b924db8f +size 76770 diff --git a/data/test/images/image_salient_detection.jpg b/data/test/images/image_salient_detection.jpg new file mode 100644 index 00000000..9c0632d3 --- /dev/null +++ b/data/test/images/image_salient_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70ea0c06f9cfe3882253f7175221d47e394ab9c469076ab220e880b17dbcdd02 +size 48552 diff --git a/data/test/images/ocr_recognition_document.png b/data/test/images/ocr_recognition_document.png new file mode 100644 index 00000000..d74018bb --- /dev/null +++ b/data/test/images/ocr_recognition_document.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29f2ad929c852f6456367054d13e113078cf06b763fe54d73fd324f789331aa3 +size 61611 diff --git a/data/test/videos/dog.avi b/data/test/videos/dog.avi new file mode 100644 index 00000000..afcda087 --- /dev/null +++ b/data/test/videos/dog.avi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:469090fb217a34a2c096cfd42c251da69dca9fcd1a3c1faae7d29183c1816c14 +size 12834294 diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index d906a80d..09bff2c1 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -362,8 +362,10 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ - f'Revision={revision}&FilePath={file_name}' + if file_name.endswith('.csv'): + file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' 
\ + f'Revision={revision}&FilePath={file_name}' + return file_name def get_dataset_access_config( self, diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index a0aab6d3..0bc16026 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -38,6 +38,7 @@ class Models(object): # audio models sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' generic_asr = 'generic-asr' @@ -86,6 +87,7 @@ class Pipelines(object): body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image' human_detection = 'resnet18-human-detection' object_detection = 'vit-object-detection' + salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' live_category = 'live-category' @@ -109,6 +111,7 @@ class Pipelines(object): skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' crowd_counting = 'hrnet-crowd-counting' + video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' # nlp tasks sentence_similarity = 'sentence-similarity' @@ -132,6 +135,7 @@ class Pipelines(object): sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' kws_kwsbp = 'kws-kwsbp' asr_inference = 'asr-inference' @@ -215,7 +219,7 @@ class Preprocessors(object): # multi-modal preprocessor ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' - mplug_visual_question_answering = 'mplug-visual-question-answering' + mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' class Metrics(object): diff --git a/modelscope/models/audio/kws/__init__.py b/modelscope/models/audio/kws/__init__.py index f3db5e08..dd183fe5 100644 --- a/modelscope/models/audio/kws/__init__.py +++ b/modelscope/models/audio/kws/__init__.py @@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .generic_key_word_spotting import GenericKeyWordSpotting + from .farfield.model import FSMNSeleNetV2Decorator else: _import_structure = { 'generic_key_word_spotting': ['GenericKeyWordSpotting'], + 'farfield.model': ['FSMNSeleNetV2Decorator'], } import sys diff --git a/modelscope/models/audio/kws/farfield/__init__.py b/modelscope/models/audio/kws/farfield/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/kws/farfield/fsmn.py b/modelscope/models/audio/kws/farfield/fsmn.py new file mode 100644 index 00000000..e88d3976 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/fsmn.py @@ -0,0 +1,495 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .model_def import (HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32, + printNeonMatrix, printNeonVector) + +DEBUG = False + + +def to_kaldi_matrix(np_mat): + """ function that transform as str numpy mat to standard kaldi str matrix + + Args: + np_mat: numpy mat + + Returns: str + """ + np.set_printoptions(threshold=np.inf, linewidth=np.nan) + out_str = str(np_mat) + out_str = out_str.replace('[', '') + out_str = out_str.replace(']', '') + return '[ %s ]\n' % out_str + + +def print_tensor(torch_tensor): + """ print torch tensor for debug + + Args: + torch_tensor: a tensor + """ + re_str = '' + x = torch_tensor.detach().squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += 
'\n' + print(re_str) + + +class LinearTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(LinearTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.linear = nn.Linear(input_dim, output_dim, bias=False) + + self.debug = False + self.dataout = None + + def forward(self, input): + output = self.linear(input) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + printNeonMatrix(self.linear.weight) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class AffineTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(AffineTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + + self.linear = nn.Linear(input_dim, output_dim) + + self.debug = False + self.dataout = None + + def forward(self, input): + output = self.linear(input) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + printNeonMatrix(self.linear.weight) + printNeonVector(self.linear.bias) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1 1 0\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += to_kaldi_matrix(x) + + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class Fsmn(nn.Module): + """ + FSMN implementation. + """ + + def __init__(self, + input_dim, + output_dim, + lorder=None, + rorder=None, + lstride=None, + rstride=None): + super(Fsmn, self).__init__() + + self.dim = input_dim + + if lorder is None: + return + + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + + self.conv_left = nn.Conv2d( + self.dim, + self.dim, (lorder, 1), + dilation=(lstride, 1), + groups=self.dim, + bias=False) + + if rorder > 0: + self.conv_right = nn.Conv2d( + self.dim, + self.dim, (rorder, 1), + dilation=(rstride, 1), + groups=self.dim, + bias=False) + else: + self.conv_right = None + + self.debug = False + self.dataout = None + + def forward(self, input): + x = torch.unsqueeze(input, 1) + x_per = x.permute(0, 3, 2, 1) + + y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) + + if self.conv_right is not None: + y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) + y_right = y_right[:, :, self.rstride:, :] + out = x_per + self.conv_left(y_left) + self.conv_right(y_right) + else: + out = x_per + self.conv_left(y_left) + + out1 = out.permute(0, 3, 2, 1) + output = out1.squeeze(1) + + if self.debug: + self.dataout = output + + return output + + def print_model(self): + tmpw = self.conv_left.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + if self.conv_right is not None: + tmpw = self.conv_right.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + re_str += ' %d %d %d %d %d 0\n' % ( + 1, self.lorder, self.rorder, self.lstride, 
self.rstride) + + lfiters = self.state_dict()['conv_left.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + + if self.conv_right is not None: + rfiters = self.state_dict()['conv_right.weight'] + x = (rfiters.squeeze().numpy().T) + re_str += to_kaldi_matrix(x) + re_str += '\n' + + return re_str + + +class RectifiedLinear(nn.Module): + + def __init__(self, input_dim, output_dim): + super(RectifiedLinear, self).__init__() + self.dim = input_dim + self.relu = nn.ReLU() + + def forward(self, input): + return self.relu(input) + + def to_kaldi_nnet(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + re_str += '\n' + return re_str + + +class FSMNNet(nn.Module): + """ + FSMN net for keyword spotting + """ + + def __init__(self, + input_dim=200, + linear_dim=128, + proj_dim=128, + lorder=10, + rorder=1, + num_syn=5, + fsmn_layers=4): + """ + Args: + input_dim: input dimension + linear_dim: fsmn input dimension + proj_dim: fsmn projection dimension + lorder: fsmn left order + rorder: fsmn right order + num_syn: output dimension + fsmn_layers: no. of sequential fsmn layers + """ + super(FSMNNet, self).__init__() + + self.input_dim = input_dim + self.linear_dim = linear_dim + self.proj_dim = proj_dim + self.lorder = lorder + self.rorder = rorder + self.num_syn = num_syn + self.fsmn_layers = fsmn_layers + + self.linear1 = AffineTransform(input_dim, linear_dim) + self.relu = RectifiedLinear(linear_dim, linear_dim) + + self.fsmn = self._build_repeats(linear_dim, proj_dim, lorder, rorder, + fsmn_layers) + + self.linear2 = AffineTransform(linear_dim, num_syn) + + @staticmethod + def _build_repeats(linear_dim=136, + proj_dim=68, + lorder=3, + rorder=2, + fsmn_layers=5): + repeats = [ + nn.Sequential( + LinearTransform(linear_dim, proj_dim), + Fsmn(proj_dim, proj_dim, lorder, rorder, 1, 1), + AffineTransform(proj_dim, linear_dim), + RectifiedLinear(linear_dim, linear_dim)) + for i in range(fsmn_layers) + ] + + return nn.Sequential(*repeats) + + def forward(self, input): + x1 = self.linear1(input) + x2 = self.relu(x1) + x3 = self.fsmn(x2) + x4 = self.linear2(x3) + return x4 + + def print_model(self): + self.linear1.print_model() + + for layer in self.fsmn: + layer[0].print_model() + layer[1].print_model() + layer[2].print_model() + + self.linear2.print_model() + + def print_header(self): + # + # write total header + # + header = [0.0] * HEADER_BLOCK_SIZE * 4 + # numins + header[0] = 0.0 + # numouts + header[1] = 0.0 + # dimins + header[2] = self.input_dim + # dimouts + header[3] = self.num_syn + # numlayers + header[4] = 3 + + # + # write each layer's header + # + hidx = 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.input_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_RELU.value) + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_SEQUENTIAL_FSMN.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.proj_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = self.lorder + header[HEADER_BLOCK_SIZE * hidx + 5] = self.rorder + header[HEADER_BLOCK_SIZE * hidx + 6] = self.fsmn_layers + header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = 
float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = self.num_syn + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_SOFTMAX.value) + + for h in header: + print(f32ToI32(h)) + + def to_kaldi_nnet(self): + re_str = '' + re_str += '\n' + re_str += self.linear1.to_kaldi_nnet() + re_str += self.relu.to_kaldi_nnet() + + for fsmn in self.fsmn: + re_str += fsmn[0].to_kaldi_nnet() + re_str += fsmn[1].to_kaldi_nnet() + re_str += fsmn[2].to_kaldi_nnet() + re_str += fsmn[3].to_kaldi_nnet() + + re_str += self.linear2.to_kaldi_nnet() + re_str += ' %d %d\n' % (self.num_syn, self.num_syn) + re_str += '\n' + re_str += '\n' + + return re_str + + +class DFSMN(nn.Module): + """ + One deep fsmn layer + """ + + def __init__(self, + dimproj=64, + dimlinear=128, + lorder=20, + rorder=1, + lstride=1, + rstride=1): + """ + Args: + dimproj: projection dimension, input and output dimension of memory blocks + dimlinear: dimension of mapping layer + lorder: left order + rorder: right order + lstride: left stride + rstride: right stride + """ + super(DFSMN, self).__init__() + + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + + self.expand = AffineTransform(dimproj, dimlinear) + self.shrink = LinearTransform(dimlinear, dimproj) + + self.conv_left = nn.Conv2d( + dimproj, + dimproj, (lorder, 1), + dilation=(lstride, 1), + groups=dimproj, + bias=False) + + if rorder > 0: + self.conv_right = nn.Conv2d( + dimproj, + dimproj, (rorder, 1), + dilation=(rstride, 1), + groups=dimproj, + bias=False) + else: + self.conv_right = None + + def forward(self, input): + f1 = F.relu(self.expand(input)) + p1 = self.shrink(f1) + + x = torch.unsqueeze(p1, 1) + x_per = x.permute(0, 3, 2, 1) + + y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) + + if self.conv_right is not None: + y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) + y_right = y_right[:, :, self.rstride:, :] + out = x_per + self.conv_left(y_left) + self.conv_right(y_right) + else: + out = x_per + self.conv_left(y_left) + + out1 = out.permute(0, 3, 2, 1) + output = input + out1.squeeze(1) + + return output + + def print_model(self): + self.expand.print_model() + self.shrink.print_model() + + tmpw = self.conv_left.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + if self.conv_right is not None: + tmpw = self.conv_right.weight + tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0]) + for j in range(tmpw.shape[0]): + tmpwm[:, j] = tmpw[j, 0, :, 0] + + printNeonMatrix(tmpwm) + + +def build_dfsmn_repeats(linear_dim=128, + proj_dim=64, + lorder=20, + rorder=1, + fsmn_layers=6): + """ + build stacked dfsmn layers + Args: + linear_dim: + proj_dim: + lorder: + rorder: + fsmn_layers: + + Returns: + + """ + repeats = [ + nn.Sequential(DFSMN(proj_dim, linear_dim, lorder, rorder, 1, 1)) + for i in range(fsmn_layers) + ] + + return nn.Sequential(*repeats) diff --git a/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py new file mode 100644 index 00000000..1884e533 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/fsmn_sele_v2.py @@ -0,0 +1,236 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .fsmn import AffineTransform, Fsmn, 
LinearTransform, RectifiedLinear
+from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32
+
+
+class FSMNUnit(nn.Module):
+    """ A multi-channel fsmn unit
+
+    """
+
+    def __init__(self, dimlinear=128, dimproj=64, lorder=20, rorder=1):
+        """
+        Args:
+            dimlinear: input / output dimension
+            dimproj: fsmn input / output dimension
+            lorder: left order
+            rorder: right order
+        """
+        super(FSMNUnit, self).__init__()
+
+        self.shrink = LinearTransform(dimlinear, dimproj)
+        self.fsmn = Fsmn(dimproj, dimproj, lorder, rorder, 1, 1)
+        self.expand = AffineTransform(dimproj, dimlinear)
+
+        self.debug = False
+        self.dataout = None
+
+    '''
+    batch, time, channel, feature
+    '''
+
+    def forward(self, input):
+        if torch.cuda.is_available():
+            out = torch.zeros(input.shape).cuda()
+        else:
+            out = torch.zeros(input.shape)
+
+        for n in range(input.shape[2]):
+            out1 = self.shrink(input[:, :, n, :])
+            out2 = self.fsmn(out1)
+            out[:, :, n, :] = F.relu(self.expand(out2))
+
+        if self.debug:
+            self.dataout = out
+
+        return out
+
+    def print_model(self):
+        self.shrink.print_model()
+        self.fsmn.print_model()
+        self.expand.print_model()
+
+    def to_kaldi_nnet(self):
+        re_str = self.shrink.to_kaldi_nnet()
+        re_str += self.fsmn.to_kaldi_nnet()
+        re_str += self.expand.to_kaldi_nnet()
+
+        relu = RectifiedLinear(self.expand.linear.out_features,
+                               self.expand.linear.out_features)
+        re_str += relu.to_kaldi_nnet()
+
+        return re_str
+
+
+class FSMNSeleNetV2(nn.Module):
+    """ FSMN model with channel selection.
+    """
+
+    def __init__(self,
+                 input_dim=120,
+                 linear_dim=128,
+                 proj_dim=64,
+                 lorder=20,
+                 rorder=1,
+                 num_syn=5,
+                 fsmn_layers=5,
+                 sele_layer=0):
+        """
+        Args:
+            input_dim: input dimension
+            linear_dim: fsmn input dimension
+            proj_dim: fsmn projection dimension
+            lorder: fsmn left order
+            rorder: fsmn right order
+            num_syn: output dimension
+            fsmn_layers: no.
of fsmn units + sele_layer: channel selection layer index + """ + super(FSMNSeleNetV2, self).__init__() + + self.sele_layer = sele_layer + + self.featmap = AffineTransform(input_dim, linear_dim) + + self.mem = [] + for i in range(fsmn_layers): + unit = FSMNUnit(linear_dim, proj_dim, lorder, rorder) + self.mem.append(unit) + self.add_module('mem_{:d}'.format(i), unit) + + self.decision = AffineTransform(linear_dim, num_syn) + + def forward(self, input): + # multi-channel feature mapping + if torch.cuda.is_available(): + x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], + self.featmap.linear.out_features).cuda() + else: + x = torch.zeros(input.shape[0], input.shape[1], input.shape[2], + self.featmap.linear.out_features) + + for n in range(input.shape[2]): + x[:, :, n, :] = F.relu(self.featmap(input[:, :, n, :])) + + for i, unit in enumerate(self.mem): + y = unit(x) + + # perform channel selection + if i == self.sele_layer: + pool = nn.MaxPool2d((y.shape[2], 1), stride=(y.shape[2], 1)) + y = pool(y) + + x = y + + # remove channel dimension + y = torch.squeeze(y, -2) + z = self.decision(y) + + return z + + def print_model(self): + self.featmap.print_model() + + for unit in self.mem: + unit.print_model() + + self.decision.print_model() + + def print_header(self): + ''' + get FSMN params + ''' + input_dim = self.featmap.linear.in_features + linear_dim = self.featmap.linear.out_features + proj_dim = self.mem[0].shrink.linear.out_features + lorder = self.mem[0].fsmn.conv_left.kernel_size[0] + rorder = 0 + if self.mem[0].fsmn.conv_right is not None: + rorder = self.mem[0].fsmn.conv_right.kernel_size[0] + + num_syn = self.decision.linear.out_features + fsmn_layers = len(self.mem) + + # no. of output channels, 0.0 means the same as numins + # numouts = 0.0 + numouts = 1.0 + + # + # write total header + # + header = [0.0] * HEADER_BLOCK_SIZE * 4 + # numins + header[0] = 0.0 + # numouts + header[1] = numouts + # dimins + header[2] = input_dim + # dimouts + header[3] = num_syn + # numlayers + header[4] = 3 + + # + # write each layer's header + # + hidx = 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_RELU.value) + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_SEQUENTIAL_FSMN.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0 + header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = proj_dim + header[HEADER_BLOCK_SIZE * hidx + 4] = lorder + header[HEADER_BLOCK_SIZE * hidx + 5] = rorder + header[HEADER_BLOCK_SIZE * hidx + 6] = fsmn_layers + if numouts == 1.0: + header[HEADER_BLOCK_SIZE * hidx + 7] = float(self.sele_layer) + else: + header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0 + hidx += 1 + + header[HEADER_BLOCK_SIZE * hidx + 0] = float( + LayerType.LAYER_DENSE.value) + header[HEADER_BLOCK_SIZE * hidx + 1] = numouts + header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim + header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn + header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0 + header[HEADER_BLOCK_SIZE * hidx + 5] = float( + ActivationType.ACTIVATION_SOFTMAX.value) + + for h in header: + print(f32ToI32(h)) + + def to_kaldi_nnet(self): + re_str = '\n' + + re_str = self.featmap.to_kaldi_nnet() + + relu = 
RectifiedLinear(self.featmap.linear.out_features, + self.featmap.linear.out_features) + re_str += relu.to_kaldi_nnet() + + for unit in self.mem: + re_str += unit.to_kaldi_nnet() + + re_str += self.decision.to_kaldi_nnet() + + re_str += ' %d %d\n' % (self.decision.linear.out_features, + self.decision.linear.out_features) + re_str += '\n' + re_str += '\n' + + return re_str diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py new file mode 100644 index 00000000..81e47350 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/model.py @@ -0,0 +1,74 @@ +import os +from typing import Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .fsmn_sele_v2 import FSMNSeleNetV2 + + +@MODELS.register_module( + Tasks.keyword_spotting, module_name=Models.speech_dfsmn_kws_char_farfield) +class FSMNSeleNetV2Decorator(TorchModel): + r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """ + + MODEL_TXT = 'model.txt' + SC_CONFIG = 'sound_connect.conf' + SC_CONF_ITEM_KWS_MODEL = '${kws_model}' + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the dfsmn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + sc_config_file = os.path.join(model_dir, self.SC_CONFIG) + model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + model_bin_file = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + self._model = None + if os.path.exists(model_bin_file): + self._model = FSMNSeleNetV2(*args, **kwargs) + checkpoint = torch.load(model_bin_file) + self._model.load_state_dict(checkpoint, strict=False) + + self._sc = None + if os.path.exists(model_txt_file): + with open(sc_config_file) as f: + lines = f.readlines() + with open(sc_config_file, 'w') as f: + for line in lines: + if self.SC_CONF_ITEM_KWS_MODEL in line: + line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, + model_txt_file) + f.write(line) + import py_sound_connect + self._sc = py_sound_connect.SoundConnect(sc_config_file) + self.size_in = self._sc.bytesPerBlockIn() + self.size_out = self._sc.bytesPerBlockOut() + + if self._model is None and self._sc is None: + raise Exception( + f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' + ) + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + ... 
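+        # NOTE: forward() is left as a stub here; runtime keyword spotting is
+        # driven by forward_decode() below, which streams raw audio blocks
+        # through the py_sound_connect SoundConnect instance configured in
+        # __init__ and returns any spotted keyword in the result dict.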
+ + def forward_decode(self, data: bytes): + result = {'pcm': self._sc.process(data, self.size_out)} + state = self._sc.kwsState() + if state == 2: + result['kws'] = { + 'keyword': + self._sc.kwsKeyword(self._sc.kwsSpottedKeywordIndex()), + 'offset': self._sc.kwsKeywordOffset(), + 'length': self._sc.kwsKeywordLength(), + 'confidence': self._sc.kwsConfidence() + } + return result diff --git a/modelscope/models/audio/kws/farfield/model_def.py b/modelscope/models/audio/kws/farfield/model_def.py new file mode 100644 index 00000000..3f5ba7d7 --- /dev/null +++ b/modelscope/models/audio/kws/farfield/model_def.py @@ -0,0 +1,121 @@ +import math +import struct +from enum import Enum + +HEADER_BLOCK_SIZE = 10 + + +class LayerType(Enum): + LAYER_DENSE = 1 + LAYER_GRU = 2 + LAYER_ATTENTION = 3 + LAYER_FSMN = 4 + LAYER_SEQUENTIAL_FSMN = 5 + LAYER_FSMN_SELE = 6 + LAYER_GRU_ATTENTION = 7 + LAYER_DFSMN = 8 + + +class ActivationType(Enum): + ACTIVATION_NONE = 0 + ACTIVATION_RELU = 1 + ACTIVATION_TANH = 2 + ACTIVATION_SIGMOID = 3 + ACTIVATION_SOFTMAX = 4 + ACTIVATION_LOGSOFTMAX = 5 + + +def f32ToI32(f): + """ + print layer + """ + bs = struct.pack('f', f) + + ba = bytearray() + ba.append(bs[0]) + ba.append(bs[1]) + ba.append(bs[2]) + ba.append(bs[3]) + + return struct.unpack('i', ba)[0] + + +def printNeonMatrix(w): + """ + print matrix with neon padding + """ + numrows, numcols = w.shape + numnecols = math.ceil(numcols / 4) + + for i in range(numrows): + for j in range(numcols): + print(f32ToI32(w[i, j])) + + for j in range(numnecols * 4 - numcols): + print(0) + + +def printNeonVector(b): + """ + print vector with neon padding + """ + size = b.shape[0] + nesize = math.ceil(size / 4) + + for i in range(size): + print(f32ToI32(b[i])) + + for i in range(nesize * 4 - size): + print(0) + + +def printDense(layer): + """ + save dense layer + """ + statedict = layer.state_dict() + printNeonMatrix(statedict['weight']) + printNeonVector(statedict['bias']) + + +def printGRU(layer): + """ + save gru layer + """ + statedict = layer.state_dict() + weight = [statedict['weight_ih_l0'], statedict['weight_hh_l0']] + bias = [statedict['bias_ih_l0'], statedict['bias_hh_l0']] + numins, numouts = weight[0].shape + numins = numins // 3 + + # output input weights + w_rx = weight[0][:numins, :] + w_zx = weight[0][numins:numins * 2, :] + w_x = weight[0][numins * 2:, :] + printNeonMatrix(w_zx) + printNeonMatrix(w_rx) + printNeonMatrix(w_x) + + # output recurrent weights + w_rh = weight[1][:numins, :] + w_zh = weight[1][numins:numins * 2, :] + w_h = weight[1][numins * 2:, :] + printNeonMatrix(w_zh) + printNeonMatrix(w_rh) + printNeonMatrix(w_h) + + # output input bias + b_rx = bias[0][:numins] + b_zx = bias[0][numins:numins * 2] + b_x = bias[0][numins * 2:] + printNeonVector(b_zx) + printNeonVector(b_rx) + printNeonVector(b_x) + + # output recurrent bias + b_rh = bias[1][:numins] + b_zh = bias[1][numins:numins * 2] + b_h = bias[1][numins * 2:] + printNeonVector(b_zh) + printNeonVector(b_rh) + printNeonVector(b_h) diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index a05bc57d..f2ecd08e 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,4 +5,5 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, image_colorization, image_denoise, image_instance_segmentation, image_portrait_enhancement, image_to_image_generation, image_to_image_translation, object_detection, - product_retrieval_embedding, super_resolution, virual_tryon) + product_retrieval_embedding, salient_detection, + super_resolution, video_single_object_tracking, virual_tryon) diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index eaf5d0c5..c484b37b 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel): model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) self.model = NAFNet(**self.config.model.network_g) self.loss = PSNRLoss() - - if torch.cuda.is_available(): - self._device = torch.device('cuda') - else: - self._device = torch.device('cpu') - - self.model = self.model.to(self._device) self.model = self._load_pretrained(self.model, model_path) - if self.training: - self.model.train() - else: - self.model.eval() - def _load_pretrained(self, net, load_path, @@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel): Returns: Dict[str, Tensor]: results """ - for key, value in inputs.items(): - inputs[key] = inputs[key].to(self._device) if self.training: return self._train_forward(**inputs) elif 'target' in inputs: diff --git a/modelscope/models/cv/image_instance_segmentation/__init__.py b/modelscope/models/cv/image_instance_segmentation/__init__.py index 4706f8f8..8ccfef4b 100644 --- a/modelscope/models/cv/image_instance_segmentation/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/__init__.py @@ -7,13 +7,11 @@ if TYPE_CHECKING: from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin from .model import CascadeMaskRCNNSwinModel from .postprocess_utils import get_img_ins_seg_result - from .datasets import ImageInstanceSegmentationCocoDataset else: _import_structure = { 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], 'model': ['CascadeMaskRCNNSwinModel'], 'postprocess_utils': ['get_img_ins_seg_result'], - 'datasets': ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py index 93c71b46..cca1432f 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/datasets/__init__.py @@ -1,2 +1 @@ -from .dataset import ImageInstanceSegmentationCocoDataset from .transforms import build_preprocess_transform diff --git a/modelscope/models/cv/object_detection/mmdet_model.py b/modelscope/models/cv/object_detection/mmdet_model.py index 51f05e47..7bf81349 100644 --- a/modelscope/models/cv/object_detection/mmdet_model.py +++ b/modelscope/models/cv/object_detection/mmdet_model.py @@ -38,7 +38,7 @@ class DetectionModel(TorchModel): self.model, model_path, map_location='cpu') self.class_names = checkpoint['meta']['CLASSES'] config.test_pipeline[0].type = 'LoadImageFromWebcam' - self.test_pipeline = Compose( + self.transform_input = Compose( replace_ImageToTensor(config.test_pipeline)) self.model.cfg = config self.model.eval() @@ -56,7 +56,7 @@ class DetectionModel(TorchModel): from mmcv.parallel import collate, scatter data = dict(img=image) - data = self.test_pipeline(data) + data = 
self.transform_input(data) data = collate([data], samples_per_gpu=1) data['img_metas'] = [ img_metas.data[0] for img_metas in data['img_metas'] diff --git a/modelscope/models/cv/salient_detection/__init__.py b/modelscope/models/cv/salient_detection/__init__.py new file mode 100644 index 00000000..b3b5b5fa --- /dev/null +++ b/modelscope/models/cv/salient_detection/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .salient_model import SalientDetection + +else: + _import_structure = { + 'salient_model': ['SalientDetection'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/salient_detection/models/__init__.py b/modelscope/models/cv/salient_detection/models/__init__.py new file mode 100644 index 00000000..0850c33d --- /dev/null +++ b/modelscope/models/cv/salient_detection/models/__init__.py @@ -0,0 +1 @@ +from .u2net import U2NET diff --git a/modelscope/models/cv/salient_detection/models/u2net.py b/modelscope/models/cv/salient_detection/models/u2net.py new file mode 100644 index 00000000..0a0a4511 --- /dev/null +++ b/modelscope/models/cv/salient_detection/models/u2net.py @@ -0,0 +1,300 @@ +# Implementation in this file is modifed from source code avaiable via https://github.com/xuebinqin/U-2-Net +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class REBNCONV(nn.Module): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(REBNCONV, self).__init__() + self.conv_s1 = nn.Conv2d( + in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate) + self.bn_s1 = nn.BatchNorm2d(out_ch) + self.relu_s1 = nn.ReLU(inplace=True) + + def forward(self, x): + hx = x + xout = self.relu_s1(self.bn_s1(self.conv_s1(hx))) + return xout + + +def _upsample_like(src, tar): + """upsample tensor 'src' to have the same spatial size with tensor 'tar'.""" + src = F.upsample(src, size=tar.shape[2:], mode='bilinear') + return src + + +class RSU7(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU7, self).__init__() + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = 
self.pool3(hx3) + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + hx5 = self.rebnconv5(hx) + hx = self.pool5(hx5) + hx6 = self.rebnconv6(hx) + hx7 = self.rebnconv7(hx6) + hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1)) + hx6dup = _upsample_like(hx6d, hx5) + hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU6(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU6, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + hx5 = self.rebnconv5(hx) + hx6 = self.rebnconv6(hx5) + hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU5(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU5, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + hx4 = 
self.rebnconv4(hx) + hx5 = self.rebnconv5(hx4) + hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU4(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + hx3 = self.rebnconv3(hx) + hx4 = self.rebnconv4(hx3) + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + return hx1d + hxin + + +class RSU4F(nn.Module): + + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4F, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + + hx = x + hxin = self.rebnconvin(hx) + hx1 = self.rebnconv1(hxin) + hx2 = self.rebnconv2(hx1) + hx3 = self.rebnconv3(hx2) + hx4 = self.rebnconv4(hx3) + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1)) + hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1)) + return hx1d + hxin + + +class U2NET(nn.Module): + + def __init__(self, in_ch=3, out_ch=1): + super(U2NET, self).__init__() + + # encoder + self.stage1 = RSU7(in_ch, 32, 64) + self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage2 = RSU6(64, 32, 128) + self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage3 = RSU5(128, 64, 256) + self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage4 = RSU4(256, 128, 512) + self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage5 = RSU4F(512, 256, 512) + self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + self.stage6 = RSU4F(512, 256, 512) + # decoder + self.stage5d = RSU4F(1024, 256, 512) + self.stage4d = RSU4(1024, 128, 256) + self.stage3d = RSU5(512, 64, 128) + self.stage2d = RSU6(256, 32, 64) + self.stage1d = RSU7(128, 16, 64) + self.side1 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side2 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side3 = nn.Conv2d(128, out_ch, 3, padding=1) + self.side4 = nn.Conv2d(256, out_ch, 3, padding=1) + self.side5 = nn.Conv2d(512, out_ch, 3, padding=1) + self.side6 = 
nn.Conv2d(512, out_ch, 3, padding=1) + self.outconv = nn.Conv2d(6 * out_ch, out_ch, 1) + + def forward(self, x): + + hx = x + hx1 = self.stage1(hx) + hx = self.pool12(hx1) + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + hx5d = self.stage5d(torch.cat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1)) + d1 = self.side1(hx1d) + d2 = self.side2(hx2d) + d2 = _upsample_like(d2, d1) + d3 = self.side3(hx3d) + d3 = _upsample_like(d3, d1) + d4 = self.side4(hx4d) + d4 = _upsample_like(d4, d1) + d5 = self.side5(hx5d) + d5 = _upsample_like(d5, d1) + d6 = self.side6(hx6) + d6 = _upsample_like(d6, d1) + d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1)) + return torch.sigmoid(d0), torch.sigmoid(d1), torch.sigmoid( + d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid( + d5), torch.sigmoid(d6) diff --git a/modelscope/models/cv/salient_detection/salient_model.py b/modelscope/models/cv/salient_detection/salient_model.py new file mode 100644 index 00000000..539d1f24 --- /dev/null +++ b/modelscope/models/cv/salient_detection/salient_model.py @@ -0,0 +1,63 @@ +import os.path as osp + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .models import U2NET + + +@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection) +class SalientDetection(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model = U2NET(3, 1) + checkpoint = torch.load(model_path, map_location='cpu') + self.transform_input = transforms.Compose([ + transforms.Resize((320, 320)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + self.model.load_state_dict(checkpoint) + self.model.eval() + + def inference(self, data): + """data is tensor 3 * H * W ---> return tensor H * W .""" + data = data.unsqueeze(0) + if next(self.model.parameters()).is_cuda: + data = data.to( + torch.device([next(self.model.parameters()).device][0])) + + with torch.no_grad(): + results = self.model(data) + + if next(self.model.parameters()).is_cuda: + return results[0][0, 0, :, :].cpu() + return results[0][0, 0, :, :] + + def preprocess(self, image): + """image is numpy.""" + data = self.transform_input(Image.fromarray(image)) + return data.float() + + def postprocess(self, inputs): + """resize .""" + data = inputs['data'] + w = inputs['img_w'] + h = inputs['img_h'] + data_norm = (data - torch.min(data)) / ( + torch.max(data) - torch.min(data)) + data_norm_np = (data_norm.numpy() * 255).astype('uint8') + data_norm_rst = cv2.resize(data_norm_np, (w, h)) + + return data_norm_rst diff --git 
a/modelscope/models/cv/video_single_object_tracking/__init__.py b/modelscope/models/cv/video_single_object_tracking/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/config/__init__.py b/modelscope/models/cv/video_single_object_tracking/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/config/ostrack.py b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py new file mode 100644 index 00000000..772813cf --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/config/ostrack.py @@ -0,0 +1,39 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +from easydict import EasyDict as edict + +cfg = edict() + +# MODEL +cfg.MODEL = edict() + +# MODEL.BACKBONE +cfg.MODEL.BACKBONE = edict() +cfg.MODEL.BACKBONE.TYPE = 'vit_base_patch16_224_ce' +cfg.MODEL.BACKBONE.STRIDE = 16 +cfg.MODEL.BACKBONE.CAT_MODE = 'direct' +cfg.MODEL.BACKBONE.DROP_PATH_RATE = 0.1 +cfg.MODEL.BACKBONE.CE_LOC = [3, 6, 9] +cfg.MODEL.BACKBONE.CE_KEEP_RATIO = [0.7, 0.7, 0.7] +cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE = 'CTR_POINT' + +# MODEL.HEAD +cfg.MODEL.HEAD = edict() +cfg.MODEL.HEAD.TYPE = 'CENTER' +cfg.MODEL.HEAD.NUM_CHANNELS = 256 + +# DATA +cfg.DATA = edict() +cfg.DATA.MEAN = [0.485, 0.456, 0.406] +cfg.DATA.STD = [0.229, 0.224, 0.225] +cfg.DATA.SEARCH = edict() +cfg.DATA.SEARCH.SIZE = 384 +cfg.DATA.TEMPLATE = edict() +cfg.DATA.TEMPLATE.SIZE = 192 + +# TEST +cfg.TEST = edict() +cfg.TEST.TEMPLATE_FACTOR = 2.0 +cfg.TEST.TEMPLATE_SIZE = 192 +cfg.TEST.SEARCH_FACTOR = 5.0 +cfg.TEST.SEARCH_SIZE = 384 diff --git a/modelscope/models/cv/video_single_object_tracking/models/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py new file mode 100644 index 00000000..158d88aa --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn.py @@ -0,0 +1,54 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0., + rpe=False, + z_size=7, + x_size=14): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, mask=None, return_attention=False): + # x: B, N, C + # mask: [B, N, ] torch.bool + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind( + 0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + + if mask is not None: + attn = attn.masked_fill( + mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + + attn = 
attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + if return_attention: + return x, attn + else: + return x diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py new file mode 100644 index 00000000..45706f71 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py @@ -0,0 +1,129 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import math + +import torch +import torch.nn as nn +from timm.models.layers import DropPath, Mlp + +from .attn import Attention + + +def candidate_elimination(attn: torch.Tensor, tokens: torch.Tensor, + lens_t: int, keep_ratio: float, + global_index: torch.Tensor, + box_mask_z: torch.Tensor): + """ + Eliminate potential background candidates for computation reduction and noise cancellation. + Args: + attn (torch.Tensor): [B, num_heads, L_t + L_s, L_t + L_s], attention weights + tokens (torch.Tensor): [B, L_t + L_s, C], template and search region tokens + lens_t (int): length of template + keep_ratio (float): keep ratio of search region tokens (candidates) + global_index (torch.Tensor): global index of search region tokens + box_mask_z (torch.Tensor): template mask used to accumulate attention weights + + Returns: + tokens_new (torch.Tensor): tokens after candidate elimination + keep_index (torch.Tensor): indices of kept search region tokens + removed_index (torch.Tensor): indices of removed search region tokens + """ + lens_s = attn.shape[-1] - lens_t + bs, hn, _, _ = attn.shape + + lens_keep = math.ceil(keep_ratio * lens_s) + if lens_keep == lens_s: + return tokens, global_index, None + + attn_t = attn[:, :, :lens_t, lens_t:] + + if box_mask_z is not None: + box_mask_z = box_mask_z.unsqueeze(1).unsqueeze(-1).expand( + -1, attn_t.shape[1], -1, attn_t.shape[-1]) + attn_t = attn_t[box_mask_z] + attn_t = attn_t.view(bs, hn, -1, lens_s) + attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s + else: + attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s + + # use sort instead of topk, due to the speed issue + # https://github.com/pytorch/pytorch/issues/22812 + sorted_attn, indices = torch.sort(attn_t, dim=1, descending=True) + + _, topk_idx = sorted_attn[:, :lens_keep], indices[:, :lens_keep] + _, non_topk_idx = sorted_attn[:, lens_keep:], indices[:, lens_keep:] + keep_index = global_index.gather(dim=1, index=topk_idx) + removed_index = global_index.gather(dim=1, index=non_topk_idx) + + # separate template and search tokens + tokens_t = tokens[:, :lens_t] + tokens_s = tokens[:, lens_t:] + + # obtain the attentive and inattentive tokens + B, L, C = tokens_s.shape + attentive_tokens = tokens_s.gather( + dim=1, index=topk_idx.unsqueeze(-1).expand(B, -1, C)) + + # concatenate these tokens + tokens_new = torch.cat([tokens_t, attentive_tokens], dim=1) + + return tokens_new, keep_index, removed_index + + +class CEBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + keep_ratio_search=1.0, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: 
drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + self.keep_ratio_search = keep_ratio_search + + def forward(self, + x, + global_index_template, + global_index_search, + mask=None, + ce_template_mask=None, + keep_ratio_search=None): + x_attn, attn = self.attn(self.norm1(x), mask, True) + x = x + self.drop_path(x_attn) + lens_t = global_index_template.shape[1] + + removed_index_search = None + if self.keep_ratio_search < 1 and (keep_ratio_search is None + or keep_ratio_search < 1): + keep_ratio_search = self.keep_ratio_search if keep_ratio_search is None else keep_ratio_search + x, global_index_search, removed_index_search = candidate_elimination( + attn, x, lens_t, keep_ratio_search, global_index_search, + ce_template_mask) + + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x, global_index_template, global_index_search, removed_index_search, attn diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/head.py b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py new file mode 100644 index 00000000..e64b68d7 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/head.py @@ -0,0 +1,141 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch +import torch.nn as nn + + +def conv(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), nn.BatchNorm2d(out_planes), nn.ReLU(inplace=True)) + + +class CenterPredictor( + nn.Module, ): + + def __init__(self, inplanes=64, channel=256, feat_sz=20, stride=16): + super(CenterPredictor, self).__init__() + self.feat_sz = feat_sz + self.stride = stride + self.img_sz = self.feat_sz * self.stride + + # corner predict + self.conv1_ctr = conv(inplanes, channel) + self.conv2_ctr = conv(channel, channel // 2) + self.conv3_ctr = conv(channel // 2, channel // 4) + self.conv4_ctr = conv(channel // 4, channel // 8) + self.conv5_ctr = nn.Conv2d(channel // 8, 1, kernel_size=1) + + # offset regress + self.conv1_offset = conv(inplanes, channel) + self.conv2_offset = conv(channel, channel // 2) + self.conv3_offset = conv(channel // 2, channel // 4) + self.conv4_offset = conv(channel // 4, channel // 8) + self.conv5_offset = nn.Conv2d(channel // 8, 2, kernel_size=1) + + # size regress + self.conv1_size = conv(inplanes, channel) + self.conv2_size = conv(channel, channel // 2) + self.conv3_size = conv(channel // 2, channel // 4) + self.conv4_size = conv(channel // 4, channel // 8) + self.conv5_size = nn.Conv2d(channel // 8, 2, kernel_size=1) + + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, x, gt_score_map=None): + """ Forward pass with input x. 
""" + score_map_ctr, size_map, offset_map = self.get_score_map(x) + + # assert gt_score_map is None + if gt_score_map is None: + bbox = self.cal_bbox(score_map_ctr, size_map, offset_map) + else: + bbox = self.cal_bbox( + gt_score_map.unsqueeze(1), size_map, offset_map) + + return score_map_ctr, bbox, size_map, offset_map + + def cal_bbox(self, + score_map_ctr, + size_map, + offset_map, + return_score=False): + max_score, idx = torch.max( + score_map_ctr.flatten(1), dim=1, keepdim=True) + idx_y = idx // self.feat_sz + idx_x = idx % self.feat_sz + + idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1) + size = size_map.flatten(2).gather(dim=2, index=idx) + offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1) + + # cx, cy, w, h + bbox = torch.cat( + [(idx_x.to(torch.float) + offset[:, :1]) / self.feat_sz, + (idx_y.to(torch.float) + offset[:, 1:]) / self.feat_sz, + size.squeeze(-1)], + dim=1) + + if return_score: + return bbox, max_score + return bbox + + def get_score_map(self, x): + + def _sigmoid(x): + y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4) + return y + + # ctr branch + x_ctr1 = self.conv1_ctr(x) + x_ctr2 = self.conv2_ctr(x_ctr1) + x_ctr3 = self.conv3_ctr(x_ctr2) + x_ctr4 = self.conv4_ctr(x_ctr3) + score_map_ctr = self.conv5_ctr(x_ctr4) + + # offset branch + x_offset1 = self.conv1_offset(x) + x_offset2 = self.conv2_offset(x_offset1) + x_offset3 = self.conv3_offset(x_offset2) + x_offset4 = self.conv4_offset(x_offset3) + score_map_offset = self.conv5_offset(x_offset4) + + # size branch + x_size1 = self.conv1_size(x) + x_size2 = self.conv2_size(x_size1) + x_size3 = self.conv3_size(x_size2) + x_size4 = self.conv4_size(x_size3) + score_map_size = self.conv5_size(x_size4) + return _sigmoid(score_map_ctr), _sigmoid( + score_map_size), score_map_offset + + +def build_box_head(cfg, hidden_dim): + stride = cfg.MODEL.BACKBONE.STRIDE + + if cfg.MODEL.HEAD.TYPE == 'CENTER': + in_channel = hidden_dim + out_channel = cfg.MODEL.HEAD.NUM_CHANNELS + feat_sz = int(cfg.DATA.SEARCH.SIZE / stride) + center_head = CenterPredictor( + inplanes=in_channel, + channel=out_channel, + feat_sz=feat_sz, + stride=stride) + return center_head + else: + raise ValueError('HEAD TYPE %s is not supported.' 
+ % cfg.MODEL.HEAD_TYPE) diff --git a/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py new file mode 100644 index 00000000..0e623505 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py @@ -0,0 +1,37 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn +from timm.models.layers import to_2tuple + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py new file mode 100644 index 00000000..e2d2f80f --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py @@ -0,0 +1,93 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch.nn as nn +from timm.models.layers import to_2tuple + +from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ + PatchEmbed + + +class BaseBackbone(nn.Module): + + def __init__(self): + super().__init__() + + # for original ViT + self.pos_embed = None + self.img_size = [224, 224] + self.patch_size = 16 + self.embed_dim = 384 + + self.cat_mode = 'direct' + + self.pos_embed_z = None + self.pos_embed_x = None + + self.template_segment_pos_embed = None + self.search_segment_pos_embed = None + + self.return_stage = [2, 5, 8, 11] + + def finetune_track(self, cfg, patch_start_index=1): + + search_size = to_2tuple(cfg.DATA.SEARCH.SIZE) + template_size = to_2tuple(cfg.DATA.TEMPLATE.SIZE) + new_patch_size = cfg.MODEL.BACKBONE.STRIDE + + self.cat_mode = cfg.MODEL.BACKBONE.CAT_MODE + + # resize patch embedding + if new_patch_size != self.patch_size: + print( + 'Inconsistent Patch Size With The Pretrained Weights, Interpolate The Weight!' 
+ ) + old_patch_embed = {} + for name, param in self.patch_embed.named_parameters(): + if 'weight' in name: + param = nn.functional.interpolate( + param, + size=(new_patch_size, new_patch_size), + mode='bicubic', + align_corners=False) + param = nn.Parameter(param) + old_patch_embed[name] = param + self.patch_embed = PatchEmbed( + img_size=self.img_size, + patch_size=new_patch_size, + in_chans=3, + embed_dim=self.embed_dim) + self.patch_embed.proj.bias = old_patch_embed['proj.bias'] + self.patch_embed.proj.weight = old_patch_embed['proj.weight'] + + # for patch embedding + patch_pos_embed = self.pos_embed[:, patch_start_index:, :] + patch_pos_embed = patch_pos_embed.transpose(1, 2) + B, E, Q = patch_pos_embed.shape + P_H, P_W = self.img_size[0] // self.patch_size, self.img_size[ + 1] // self.patch_size + patch_pos_embed = patch_pos_embed.view(B, E, P_H, P_W) + + # for search region + H, W = search_size + new_P_H, new_P_W = H // new_patch_size, W // new_patch_size + search_patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_P_H, new_P_W), + mode='bicubic', + align_corners=False) + search_patch_pos_embed = search_patch_pos_embed.flatten(2).transpose( + 1, 2) + + # for template region + H, W = template_size + new_P_H, new_P_W = H // new_patch_size, W // new_patch_size + template_patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_P_H, new_P_W), + mode='bicubic', + align_corners=False) + template_patch_pos_embed = template_patch_pos_embed.flatten( + 2).transpose(1, 2) + + self.pos_embed_z = nn.Parameter(template_patch_pos_embed) + self.pos_embed_x = nn.Parameter(search_patch_pos_embed) diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py new file mode 100644 index 00000000..977e936d --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py @@ -0,0 +1,109 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch +from torch import nn + +from modelscope.models.cv.video_single_object_tracking.models.layers.head import \ + build_box_head +from .vit_ce import vit_base_patch16_224_ce + + +class OSTrack(nn.Module): + """ This is the base class for OSTrack """ + + def __init__(self, + transformer, + box_head, + aux_loss=False, + head_type='CORNER'): + """ Initializes the model. + Parameters: + transformer: torch module of the transformer architecture. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
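+            box_head: module that regresses the target box from backbone features
+                (here the CenterPredictor built by build_box_head).
+            head_type: type of the prediction head; only 'CENTER' is handled in forward_head.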
+ """ + super().__init__() + self.backbone = transformer + self.box_head = box_head + + self.aux_loss = aux_loss + self.head_type = head_type + if head_type == 'CORNER' or head_type == 'CENTER': + self.feat_sz_s = int(box_head.feat_sz) + self.feat_len_s = int(box_head.feat_sz**2) + + def forward( + self, + template: torch.Tensor, + search: torch.Tensor, + ce_template_mask=None, + ce_keep_rate=None, + ): + x, aux_dict = self.backbone( + z=template, + x=search, + ce_template_mask=ce_template_mask, + ce_keep_rate=ce_keep_rate, + ) + + # Forward head + feat_last = x + if isinstance(x, list): + feat_last = x[-1] + out = self.forward_head(feat_last, None) + + out.update(aux_dict) + out['backbone_feat'] = x + return out + + def forward_head(self, cat_feature, gt_score_map=None): + """ + cat_feature: output embeddings of the backbone, it can be (HW1+HW2, B, C) or (HW2, B, C) + """ + enc_opt = cat_feature[:, -self. + feat_len_s:] # encoder output for the search region (B, HW, C) + opt = (enc_opt.unsqueeze(-1)).permute((0, 3, 2, 1)).contiguous() + bs, Nq, C, HW = opt.size() + opt_feat = opt.view(-1, C, self.feat_sz_s, self.feat_sz_s) + + if self.head_type == 'CENTER': + # run the center head + score_map_ctr, bbox, size_map, offset_map = self.box_head( + opt_feat, gt_score_map) + outputs_coord = bbox + outputs_coord_new = outputs_coord.view(bs, Nq, 4) + out = { + 'pred_boxes': outputs_coord_new, + 'score_map': score_map_ctr, + 'size_map': size_map, + 'offset_map': offset_map + } + return out + else: + raise NotImplementedError + + +def build_ostrack(cfg): + if cfg.MODEL.BACKBONE.TYPE == 'vit_base_patch16_224_ce': + backbone = vit_base_patch16_224_ce( + False, + drop_path_rate=cfg.MODEL.BACKBONE.DROP_PATH_RATE, + ce_loc=cfg.MODEL.BACKBONE.CE_LOC, + ce_keep_ratio=cfg.MODEL.BACKBONE.CE_KEEP_RATIO, + ) + hidden_dim = backbone.embed_dim + patch_start_index = 1 + else: + raise NotImplementedError + + backbone.finetune_track(cfg=cfg, patch_start_index=patch_start_index) + + box_head = build_box_head(cfg, hidden_dim) + + model = OSTrack( + backbone, + box_head, + aux_loss=False, + head_type=cfg.MODEL.HEAD.TYPE, + ) + + return model diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py new file mode 100644 index 00000000..a49fa50c --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py @@ -0,0 +1,24 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch + + +def combine_tokens(template_tokens, + search_tokens, + mode='direct', + return_res=False): + if mode == 'direct': + merged_feature = torch.cat((template_tokens, search_tokens), dim=1) + else: + raise NotImplementedError + + return merged_feature + + +def recover_tokens(merged_tokens, mode='direct'): + if mode == 'direct': + recovered_tokens = merged_tokens + else: + raise NotImplementedError + + return recovered_tokens diff --git a/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py new file mode 100644 index 00000000..cd393109 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py @@ -0,0 +1,343 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +from functools import partial + +import 
torch +import torch.nn as nn +from timm.models.layers import DropPath, Mlp, to_2tuple + +from modelscope.models.cv.video_single_object_tracking.models.layers.attn_blocks import \ + CEBlock +from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \ + PatchEmbed +from .base_backbone import BaseBackbone +from .utils import combine_tokens, recover_tokens + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + +class VisionTransformer(BaseBackbone): + """ Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + distilled=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + """ + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = None + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + 
self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer) for i in range(depth) + ]) + self.norm = norm_layer(embed_dim) + + +class VisionTransformerCE(VisionTransformer): + """ Vision Transformer with candidate elimination (CE) module + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + + Includes distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + distilled=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + ce_loc=None, + ce_keep_ratio=None): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + distilled (bool): model includes a distillation token and head as in DeiT models + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + """ + super().__init__() + if isinstance(img_size, tuple): + self.img_size = img_size + else: + self.img_size = to_2tuple(img_size) + self.patch_size = patch_size + self.in_chans = in_chans + + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 2 if distilled else 1 + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.dist_token = nn.Parameter(torch.zeros( + 1, 1, embed_dim)) if distilled else None + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + blocks = [] + ce_index = 0 + self.ce_loc = ce_loc + for i in range(depth): + ce_keep_ratio_i = 1.0 + if ce_loc is not None and i in ce_loc: + ce_keep_ratio_i = ce_keep_ratio[ce_index] + ce_index += 1 + + blocks.append( + CEBlock( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + keep_ratio_search=ce_keep_ratio_i)) + + self.blocks = nn.Sequential(*blocks) + self.norm = norm_layer(embed_dim) + + def forward_features( + self, + z, + x, + mask_x=None, + 
ce_template_mask=None, + ce_keep_rate=None, + ): + B = x.shape[0] + + x = self.patch_embed(x) + z = self.patch_embed(z) + + z += self.pos_embed_z + x += self.pos_embed_x + + x = combine_tokens(z, x, mode=self.cat_mode) + + x = self.pos_drop(x) + + lens_z = self.pos_embed_z.shape[1] + lens_x = self.pos_embed_x.shape[1] + + global_index_t = torch.linspace(0, lens_z - 1, lens_z).to(x.device) + global_index_t = global_index_t.repeat(B, 1) + + global_index_s = torch.linspace(0, lens_x - 1, lens_x).to(x.device) + global_index_s = global_index_s.repeat(B, 1) + removed_indexes_s = [] + for i, blk in enumerate(self.blocks): + x, global_index_t, global_index_s, removed_index_s, attn = \ + blk(x, global_index_t, global_index_s, mask_x, ce_template_mask, ce_keep_rate) + + if self.ce_loc is not None and i in self.ce_loc: + removed_indexes_s.append(removed_index_s) + + x = self.norm(x) + lens_x_new = global_index_s.shape[1] + lens_z_new = global_index_t.shape[1] + + z = x[:, :lens_z_new] + x = x[:, lens_z_new:] + + if removed_indexes_s and removed_indexes_s[0] is not None: + removed_indexes_cat = torch.cat(removed_indexes_s, dim=1) + + pruned_lens_x = lens_x - lens_x_new + pad_x = torch.zeros([B, pruned_lens_x, x.shape[2]], + device=x.device) + x = torch.cat([x, pad_x], dim=1) + index_all = torch.cat([global_index_s, removed_indexes_cat], dim=1) + # recover original token order + C = x.shape[-1] + x = torch.zeros_like(x).scatter_( + dim=1, + index=index_all.unsqueeze(-1).expand(B, -1, C).to(torch.int64), + src=x) + + x = recover_tokens(x, mode=self.cat_mode) + + # re-concatenate with the template, which may be further used by other modules + x = torch.cat([z, x], dim=1) + + aux_dict = { + 'attn': attn, + 'removed_indexes_s': removed_indexes_s, # used for visualization + } + + return x, aux_dict + + def forward(self, z, x, ce_template_mask=None, ce_keep_rate=None): + + x, aux_dict = self.forward_features( + z, + x, + ce_template_mask=ce_template_mask, + ce_keep_rate=ce_keep_rate, + ) + + return x, aux_dict + + +def _create_vision_transformer(pretrained=False, **kwargs): + model = VisionTransformerCE(**kwargs) + return model + + +def vit_base_patch16_224_ce(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). 
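+    In this variant every block is a CEBlock, and search-region tokens are actually pruned
+    only at the layer indices given in `ce_loc`, each such layer keeping the fraction of
+    tokens specified by the corresponding entry of `ce_keep_ratio`.
+
+    Minimal construction sketch (the keyword values below are illustrative; the pipeline
+    reads the real ones from cfg.MODEL.BACKBONE in build_ostrack):
+
+        backbone = vit_base_patch16_224_ce(
+            pretrained=False,
+            drop_path_rate=0.1,
+            ce_loc=[3, 6, 9],
+            ce_keep_ratio=[0.7, 0.7, 0.7])
+        backbone.finetune_track(cfg=cfg, patch_start_index=1)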
+ """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer(pretrained=pretrained, **model_kwargs) + return model diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/__init__.py b/modelscope/models/cv/video_single_object_tracking/tracker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py new file mode 100644 index 00000000..3eff252a --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py @@ -0,0 +1,139 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import torch + +from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ + cfg +from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \ + build_ostrack +from modelscope.models.cv.video_single_object_tracking.utils.utils import ( + Preprocessor, clip_box, generate_mask_cond, hann2d, sample_target, + transform_image_to_crop) + + +class OSTrack(): + + def __init__(self, ckpt_path, device): + network = build_ostrack(cfg) + network.load_state_dict( + torch.load(ckpt_path, map_location='cpu')['net'], strict=True) + self.cfg = cfg + if device.type == 'cuda': + self.network = network.to(device) + else: + self.network = network + self.network.eval() + self.preprocessor = Preprocessor(device) + self.state = None + + self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE + # motion constrain + if device.type == 'cuda': + self.output_window = hann2d( + torch.tensor([self.feat_sz, self.feat_sz]).long(), + centered=True).to(device) + else: + self.output_window = hann2d( + torch.tensor([self.feat_sz, self.feat_sz]).long(), + centered=True) + self.frame_id = 0 + # for save boxes from all queries + self.z_dict1 = {} + + def initialize(self, image, info: dict): + # forward the template once + z_patch_arr, resize_factor, z_amask_arr = sample_target( + image, + info['init_bbox'], + self.cfg.TEST.TEMPLATE_FACTOR, + output_sz=self.cfg.TEST.TEMPLATE_SIZE) + self.z_patch_arr = z_patch_arr + template = self.preprocessor.process(z_patch_arr, z_amask_arr) + with torch.no_grad(): + self.z_dict1 = template + + self.box_mask_z = None + if self.cfg.MODEL.BACKBONE.CE_LOC: + template_bbox = self.transform_bbox_to_crop( + info['init_bbox'], resize_factor, + template.tensors.device).squeeze(1) + self.box_mask_z = generate_mask_cond(self.cfg, 1, + template.tensors.device, + template_bbox) + + # save states + self.state = info['init_bbox'] + self.frame_id = 0 + + def track(self, image, info: dict = None): + H, W, _ = image.shape + self.frame_id += 1 + x_patch_arr, resize_factor, x_amask_arr = sample_target( + image, + self.state, + self.cfg.TEST.SEARCH_FACTOR, + output_sz=self.cfg.TEST.SEARCH_SIZE) # (x1, y1, w, h) + search = self.preprocessor.process(x_patch_arr, x_amask_arr) + + with torch.no_grad(): + x_dict = search + # merge the template and the search + # run the transformer + out_dict = self.network.forward( + template=self.z_dict1.tensors, + search=x_dict.tensors, + ce_template_mask=self.box_mask_z) + + # add hann windows + pred_score_map = out_dict['score_map'] + response = self.output_window * pred_score_map + pred_boxes = self.network.box_head.cal_bbox(response, + out_dict['size_map'], + out_dict['offset_map']) + pred_boxes = 
pred_boxes.view(-1, 4) + # Baseline: Take the mean of all pred boxes as the final result + pred_box = (pred_boxes.mean(dim=0) * self.cfg.TEST.SEARCH_SIZE + / resize_factor).tolist() # (cx, cy, w, h) [0,1] + # get the final box result + self.state = clip_box( + self.map_box_back(pred_box, resize_factor), H, W, margin=10) + + x1, y1, w, h = self.state + x2 = x1 + w + y2 = y1 + h + return {'target_bbox': [x1, y1, x2, y2]} + + def map_box_back(self, pred_box: list, resize_factor: float): + cx_prev, cy_prev = self.state[0] + 0.5 * self.state[2], self.state[ + 1] + 0.5 * self.state[3] + cx, cy, w, h = pred_box + half_side = 0.5 * self.cfg.TEST.SEARCH_SIZE / resize_factor + cx_real = cx + (cx_prev - half_side) + cy_real = cy + (cy_prev - half_side) + return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h] + + def transform_bbox_to_crop(self, + box_in, + resize_factor, + device, + box_extract=None, + crop_type='template'): + if crop_type == 'template': + crop_sz = torch.Tensor( + [self.cfg.TEST.TEMPLATE_SIZE, self.cfg.TEST.TEMPLATE_SIZE]) + elif crop_type == 'search': + crop_sz = torch.Tensor( + [self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE]) + else: + raise NotImplementedError + + box_in = torch.tensor(box_in) + if box_extract is None: + box_extract = box_in + else: + box_extract = torch.tensor(box_extract) + template_bbox = transform_image_to_crop( + box_in, box_extract, resize_factor, crop_sz, normalize=True) + template_bbox = template_bbox.view(1, 1, 4).to(device) + + return template_bbox diff --git a/modelscope/models/cv/video_single_object_tracking/utils/__init__.py b/modelscope/models/cv/video_single_object_tracking/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_single_object_tracking/utils/utils.py b/modelscope/models/cv/video_single_object_tracking/utils/utils.py new file mode 100644 index 00000000..505b2aa9 --- /dev/null +++ b/modelscope/models/cv/video_single_object_tracking/utils/utils.py @@ -0,0 +1,261 @@ +# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on +# https://github.com/botaoye/OSTrack/ +import math +from typing import Optional + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + + +def hann1d(sz: int, centered=True) -> torch.Tensor: + """1D cosine window.""" + if centered: + return 0.5 * (1 - torch.cos( + (2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float())) + w = 0.5 * (1 + torch.cos( + (2 * math.pi / (sz + 2)) * torch.arange(0, sz // 2 + 1).float())) + return torch.cat([w, w[1:sz - sz // 2].flip((0, ))]) + + +def hann2d(sz: torch.Tensor, centered=True) -> torch.Tensor: + """2D cosine window.""" + return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d( + sz[1].item(), centered).reshape(1, 1, 1, -1) + + +class NestedTensor(object): + + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + +class Preprocessor(object): + + def __init__(self, device: str): + self.device = device + self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1)) + self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1)) + if 'cuda' == self.device.type: + self.mean = self.mean.to(self.device) + self.std = self.std.to(self.device) + + def process(self, img_arr: np.ndarray, amask_arr: np.ndarray): + # Deal with the image patch + if 'cuda' == self.device.type: + img_tensor = torch.tensor(img_arr).to(self.device).float().permute( + (2, 0, 1)).unsqueeze(dim=0) + 
else: + img_tensor = torch.tensor(img_arr).float().permute( + (2, 0, 1)).unsqueeze(dim=0) + img_tensor_norm = ( + (img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W) + + # Deal with the attention mask + if 'cuda' == self.device.type: + amask_tensor = torch.from_numpy(amask_arr).to(torch.bool).to( + self.device).unsqueeze(dim=0) # (1,H,W) + else: + amask_tensor = torch.from_numpy(amask_arr).to( + torch.bool).unsqueeze(dim=0) # (1,H,W) + return NestedTensor(img_tensor_norm, amask_tensor) + + +def clip_box(box: list, H, W, margin=0): + x1, y1, w, h = box + x2, y2 = x1 + w, y1 + h + x1 = min(max(0, x1), W - margin) + x2 = min(max(margin, x2), W) + y1 = min(max(0, y1), H - margin) + y2 = min(max(margin, y2), H) + w = max(margin, x2 - x1) + h = max(margin, y2 - y1) + if isinstance(x1, torch.Tensor): + x1 = x1.item() + y1 = y1.item() + w = w.item() + h = h.item() + return [x1, y1, w, h] + + +def generate_mask_cond(cfg, bs, device, gt_bbox): + template_size = cfg.DATA.TEMPLATE.SIZE + stride = cfg.MODEL.BACKBONE.STRIDE + template_feat_size = template_size // stride + + if cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE == 'CTR_POINT': + if template_feat_size == 8: + index = slice(3, 4) + elif template_feat_size == 12: + index = slice(5, 6) + elif template_feat_size == 7: + index = slice(3, 4) + elif template_feat_size == 14: + index = slice(6, 7) + else: + raise NotImplementedError + box_mask_z = torch.zeros([bs, template_feat_size, template_feat_size], + device=device) + box_mask_z[:, index, index] = 1 + box_mask_z = box_mask_z.flatten(1).to(torch.bool) + else: + raise NotImplementedError + + return box_mask_z + + +def sample_target(im, + target_bb, + search_area_factor, + output_sz=None, + mask=None): + """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area + + args: + im - cv image + target_bb - target box [x, y, w, h] + search_area_factor - Ratio of crop size to target size + output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. 
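+        mask - optional segmentation mask; when given it is cropped (and, if output_sz is set, resized) together with the image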
+ + returns: + cv image - extracted crop + float - the factor by which the crop has been resized to make the crop size equal output_size + """ + if not isinstance(target_bb, list): + x, y, w, h = target_bb.tolist() + else: + x, y, w, h = target_bb + # Crop image + crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) + + if crop_sz < 1: + raise Exception('Too small bounding box.') + + x1 = round(x + 0.5 * w - crop_sz * 0.5) + x2 = x1 + crop_sz + + y1 = round(y + 0.5 * h - crop_sz * 0.5) + y2 = y1 + crop_sz + + x1_pad = max(0, -x1) + x2_pad = max(x2 - im.shape[1] + 1, 0) + + y1_pad = max(0, -y1) + y2_pad = max(y2 - im.shape[0] + 1, 0) + + # Crop target + im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] + if mask is not None: + mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad] + + # Pad + im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv2.BORDER_CONSTANT) + # deal with attention mask + H, W, _ = im_crop_padded.shape + att_mask = np.ones((H, W)) + end_x, end_y = -x2_pad, -y2_pad + if y2_pad == 0: + end_y = None + if x2_pad == 0: + end_x = None + att_mask[y1_pad:end_y, x1_pad:end_x] = 0 + if mask is not None: + mask_crop_padded = F.pad( + mask_crop, + pad=(x1_pad, x2_pad, y1_pad, y2_pad), + mode='constant', + value=0) + + if output_sz is not None: + resize_factor = output_sz / crop_sz + im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz)) + att_mask = cv2.resize(att_mask, + (output_sz, output_sz)).astype(np.bool_) + if mask is None: + return im_crop_padded, resize_factor, att_mask + mask_crop_padded = \ + F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz), + mode='bilinear', align_corners=False)[0, 0] + return im_crop_padded, resize_factor, att_mask, mask_crop_padded + + else: + if mask is None: + return im_crop_padded, att_mask.astype(np.bool_), 1.0 + return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded + + +def transform_image_to_crop(box_in: torch.Tensor, + box_extract: torch.Tensor, + resize_factor: float, + crop_sz: torch.Tensor, + normalize=False) -> torch.Tensor: + """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image + args: + box_in - the box for which the co-ordinates are to be transformed + box_extract - the box about which the image crop has been extracted. + resize_factor - the ratio between the original image scale and the scale of the image crop + crop_sz - size of the cropped image + + returns: + torch.Tensor - transformed co-ordinates of box_in + """ + box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4] + + box_in_center = box_in[0:2] + 0.5 * box_in[2:4] + + box_out_center = (crop_sz - 1) / 2 + (box_in_center + - box_extract_center) * resize_factor + box_out_wh = box_in[2:4] * resize_factor + + box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh)) + if normalize: + return box_out / crop_sz[0] + else: + return box_out + + +def check_box(box: list, image_height, image_width) -> bool: + """ To check whether the box is within the image range or not + args: + box - the bounding box in the form of [x1, y1, x2, y2] + image_height - the height of the image + image_width - the width of the image + + returns: + bool - if box is valid, return True. 
Otherwise, return False + """ + assert len(box) == 4, 'box must be in the form of: [x1, y1, x2, y2]' + if box[0] < 0 or box[0] >= image_width: + return False + if box[2] < 0 or box[2] >= image_width: + return False + if box[1] < 0 or box[1] >= image_height: + return False + if box[3] < 0 or box[3] >= image_height: + return False + return True + + +def show_tracking_result(video_in_path, bboxes, video_save_path): + cap = cv2.VideoCapture(video_in_path) + for i in range(len(bboxes)): + box = bboxes[i] + success, frame = cap.read() + if success is False: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + if i == 0: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), + 5) + video_writer.write(frame) + video_writer.release + cap.release() diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index 9a0636ee..112b3a58 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -9,9 +9,10 @@ if TYPE_CHECKING: from .gemm import GEMMForMultiModalEmbedding from .diffusion import DiffusionForTextToImageSynthesis from .mmr import VideoCLIPForMultiModalEmbedding - from .mplug_for_visual_question_answering import \ - MPlugForVisualQuestionAnswering + from .mplug_for_all_tasks import MPlugForAllTasks from .ofa_for_all_tasks import OfaForAllTasks + from .ofa_for_text_to_image_synthesis_model import \ + OfaForTextToImageSynthesis else: _import_structure = { @@ -19,8 +20,7 @@ else: 'diffusion': ['DiffusionForTextToImageSynthesis'], 'gemm': ['GEMMForMultiModalEmbedding'], 'mmr': ['VideoCLIPForMultiModalEmbedding'], - 'mplug_for_visual_question_answering': - ['MPlugForVisualQuestionAnswering'], + 'mplug_for_all_tasks': ['MPlugForAllTasks'], 'ofa_for_all_tasks': ['OfaForAllTasks'], 'ofa_for_text_to_image_synthesis_model': ['OfaForTextToImageSynthesis'] diff --git a/modelscope/models/multi_modal/clip/__init__.py b/modelscope/models/multi_modal/clip/__init__.py index bb2fb3b2..3fd492b9 100644 --- a/modelscope/models/multi_modal/clip/__init__.py +++ b/modelscope/models/multi_modal/clip/__init__.py @@ -1 +1 @@ -from .clip_model import CLIPForMultiModalEmbedding +from .model import CLIPForMultiModalEmbedding diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py new file mode 100644 index 00000000..8d356f42 --- /dev/null +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -0,0 +1,422 @@ +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
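+# This file is adapted from Google's BERT tokenization code. Within this directory,
+# model.py uses FullTokenizer to tokenize text for the BERT-based CLIP text encoder.
+# Typical usage (the vocab path below is illustrative):
+#
+#     tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
+#     tokens = tokenizer.tokenize('a photo of a cat')
+#     ids = tokenizer.convert_tokens_to_ids(tokens)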
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function
+import collections
+import os
+import re
+import unicodedata
+
+import six
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+    """Checks whether the casing config is consistent with the checkpoint name."""
+
+    # The casing has to be passed in by the user and there is no explicit check
+    # as to whether it matches the checkpoint. The casing information probably
+    # should have been stored in the bert_config.json file, but it's not, so
+    # we have to heuristically detect it to validate.
+
+    if not init_checkpoint:
+        return
+
+    m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
+    if m is None:
+        return
+
+    model_name = m.group(1)
+
+    lower_models = [
+        'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
+        'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
+    ]
+
+    cased_models = [
+        'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
+        'multi_cased_L-12_H-768_A-12'
+    ]
+
+    is_bad_config = False
+    if model_name in lower_models and not do_lower_case:
+        is_bad_config = True
+        actual_flag = 'False'
+        case_name = 'lowercased'
+        opposite_flag = 'True'
+
+    if model_name in cased_models and do_lower_case:
+        is_bad_config = True
+        actual_flag = 'True'
+        case_name = 'cased'
+        opposite_flag = 'False'
+
+    if is_bad_config:
+        raise ValueError(
+            'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
+            'However, `%s` seems to be a %s model, so you '
+            'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
+            'how the model was pre-training. If this error is wrong, please '
+            'just comment out this check.' %
+            (actual_flag, init_checkpoint, model_name, case_name,
+             opposite_flag))
+
+
+def convert_to_unicode(text):
+    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        elif isinstance(text, bytes):
+            return text.decode('utf-8', 'ignore')
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    elif six.PY2:
+        if isinstance(text, str):
+            return text.decode('utf-8', 'ignore')
+        elif isinstance(text, unicode):
+            return text
+        else:
+            raise ValueError('Unsupported string type: %s' % (type(text)))
+    else:
+        raise ValueError('Not running on Python2 or Python 3?')
+
+
+def printable_text(text):
+    """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+    # These functions want `str` for both Python2 and Python3, but in one case
+    # it's a Unicode string and in the other it's a byte string.
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode('utf-8', 'ignore') + else: + raise ValueError('Unsupported string type: %s' % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode('utf-8') + else: + raise ValueError('Unsupported string type: %s' % (type(text))) + else: + raise ValueError('Not running on Python2 or Python 3?') + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, 'r') as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + @staticmethod + def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): + """ Converts a sequence of tokens (string) in a single string. """ + + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts + like spaces before punctuations and abreviated forms. + """ + out_string = ( + out_string.replace(' .', '.').replace(' ?', '?').replace( + ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace( + " n't", "n't").replace(" 'm", "'m").replace( + " 's", "'s").replace(" 've", + "'ve").replace(" 're", "'re")) + return out_string + + text = ' '.join(tokens).replace(' ##', '').strip() + if clean_up_tokenization_spaces: + clean_text = clean_up_tokenization(text) + return clean_text + else: + return text + + def vocab_size(self): + return len(self.vocab) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. 
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(' '.join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize('NFD', text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == 'Mn': + continue + output.append(char) + return ''.join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return [''.join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(' ') + output.append(char) + output.append(' ') + else: + output.append(char) + return ''.join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(' ') + else: + output.append(char) + return ''.join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. 
This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = ''.join(chars[start:end]) + if start > 0: + substr = '##' + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == ' ' or char == '\t' or char == '\n' or char == '\r': + return True + cat = unicodedata.category(char) + if cat == 'Zs': + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == '\t' or char == '\n' or char == '\r': + return False + cat = unicodedata.category(char) + if cat in ('Cc', 'Cf'): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
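+    # The four ASCII ranges checked below correspond to the characters
+    # !"#$%&'()*+,-./ , :;<=>?@ , [\]^_` and {|}~ respectively.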
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) + or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith('P'): + return True + return False diff --git a/modelscope/models/multi_modal/clip/clip_bert.py b/modelscope/models/multi_modal/clip/clip_bert.py deleted file mode 100644 index 24ccc1fa..00000000 --- a/modelscope/models/multi_modal/clip/clip_bert.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch.nn as nn -from transformers import BertConfig, BertForMaskedLM - - -class TextTransformer(nn.Module): - - def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True): - super(TextTransformer, self).__init__() - bert_config = BertConfig.from_dict(config_dict) - if use_grad_ckp: - bert_config.gradient_checkpointing = True - - self.bert = BertForMaskedLM(bert_config).bert - - self.projector = nn.Linear( - bert_config.hidden_size, feat_dim, bias=False) - - def forward(self, input_ids, attention_mask): - trans_features = { - 'input_ids': input_ids, - 'attention_mask': attention_mask - } - - output_states = self.bert(**trans_features, return_dict=False) - output_tokens = output_states[0] - - cls_tokens = output_tokens[:, 0, :] - - return self.projector(cls_tokens) diff --git a/modelscope/models/multi_modal/clip/clip_model.py b/modelscope/models/multi_modal/clip/clip_model.py deleted file mode 100644 index e092f4af..00000000 --- a/modelscope/models/multi_modal/clip/clip_model.py +++ /dev/null @@ -1,216 +0,0 @@ -from typing import Any, Dict - -import cv2 -import json -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from PIL import Image -from tokenizers import BertWordPieceTokenizer -from torch.distributed.nn.functional import \ - all_gather as all_gather_with_backprop -from torchvision.transforms import Compose, Normalize, Resize, ToTensor - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.multi_modal.clip.clip_bert import TextTransformer -from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer -from modelscope.utils.constant import ModeKeys, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['CLIPForMultiModalEmbedding'] - - -class CLIPModel(nn.Module): - - def __init__(self, model_dir): - super(CLIPModel, self).__init__() - # including vision config and text config - model_config = json.load( - open('{}/encoder_config.json'.format(model_dir))) - - # vision encoder - vision_config = model_config['vision_config'] - self.img_size = vision_config['input_resolution'] - self.vision_encoder = VisionTransformer( - input_resolution=self.img_size, - patch_size=vision_config['patch_size'], - width=vision_config['width'], - layers=vision_config['layers'], - heads=vision_config['heads'], - output_dim=vision_config['feat_dim'], - use_grad_ckp=True) - - # text encoder - text_config = model_config['text_config'] - self.text_encoder = TextTransformer( - text_config['bert_config'], feat_dim=text_config['feat_dim']) - - self.logit_scale = nn.Parameter(torch.ones([]) * 4.6) - - def contrastive_loss(self, logits, dim): - neg_ce = torch.diag(F.log_softmax(logits, dim=dim)) - return -neg_ce.mean() - - def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None): - if img_idx is not None and all_img_idx is not None: - with torch.no_grad(): - false_neg_indicator = ( - img_idx[:, None] == all_img_idx[None, :]) - 
false_neg_indicator.fill_diagonal_(False) - t2i_sim.masked_fill_(false_neg_indicator, float('-inf')) - i2t_sim.masked_fill_(false_neg_indicator, float('-inf')) - caption_loss = self.contrastive_loss(t2i_sim, dim=1) - image_loss = self.contrastive_loss(i2t_sim, dim=1) - else: - caption_loss = self.contrastive_loss(t2i_sim, dim=1) - image_loss = self.contrastive_loss(i2t_sim, dim=1) - return (caption_loss + image_loss) / 2.0 - - def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor, - img_id_list): - img_feat = self.forward(img_tensor, input_type='img') - text_feat = self.forward((text_ids_tensor, text_masks_tensor), - input_type='text') - - global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0) - global_text_feat = torch.cat( - all_gather_with_backprop(text_feat), dim=0) - global_img_id_list = torch.cat( - all_gather_with_backprop(img_id_list), dim=0) - - t2i_sim_mat = text_feat @ global_img_feat.t() - i2t_sim_mat = img_feat @ global_text_feat.t() - - logit_scale = self.logit_scale.exp().clamp(max=100.0) - t2i_sim_mat_logits = t2i_sim_mat * logit_scale - i2t_sim_mat_logits = i2t_sim_mat * logit_scale - - loss = self.clip_loss( - t2i_sim_mat_logits, - i2t_sim_mat_logits, - img_idx=img_id_list, - all_img_idx=global_img_id_list) - - return loss - - def forward(self, input_data, input_type): - if input_type == 'img': - img_embedding = self.vision_encoder(input_data) - img_embedding = F.normalize(img_embedding, p=2.0, dim=1) - return img_embedding - elif input_type == 'text': - text_ids_tensor, text_mask_tensor = input_data - text_embedding = self.text_encoder(text_ids_tensor, - text_mask_tensor) - text_embedding = F.normalize(text_embedding, p=2.0, dim=1) - return text_embedding - elif input_type == ModeKeys.TRAIN: - return self.get_loss(*input_data) - else: - raise ValueError('Unknown input type') - - -@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) -class CLIPForMultiModalEmbedding(TorchModel): - - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) - self.clip_model = CLIPModel(model_dir=model_dir) - pretrained_params = torch.load( - '{}/pytorch_model.bin'.format(model_dir), 'cpu') - self.clip_model.load_state_dict(pretrained_params) - self.clip_model.eval() - - self.device_id = device_id - if self.device_id >= 0: - self.clip_model.to('cuda:{}'.format(self.device_id)) - logger.info('Use GPU: {}'.format(self.device_id)) - else: - logger.info('Use CPU for inference') - - # image preprocessor - norm_op = Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)) - self.img_preprocessor = Compose([ - Resize((self.clip_model.img_size, self.clip_model.img_size), - interpolation=Image.BICUBIC), - ToTensor(), norm_op - ]) - - # text tokenizer - vocab_path = '{}/vocab.txt'.format(model_dir) - self.text_tokenizer = BertWordPieceTokenizer( - vocab_path, lowercase=False) - self.text_tokenizer.enable_truncation(max_length=30) - - def tokenize_text(self, text_str): - tokens = self.text_tokenizer.encode(text_str) - max_tokens = 30 - text_ids_tensor = torch.zeros((1, max_tokens)).long() - text_mask_tensor = torch.zeros((1, max_tokens)) - - text_ids, text_mask = tokens.ids, tokens.attention_mask - text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids) - text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask) - - return text_ids_tensor, text_mask_tensor - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from modelscope.outputs import 
OutputKeys - output = { - OutputKeys.IMG_EMBEDDING: None, - OutputKeys.TEXT_EMBEDDING: None - } - if 'img' in input and input['img'] is not None: - input_img = input['img'] - if isinstance(input_img, Image.Image): - img_tensor = self.img_preprocessor(input_img)[None, ...] - elif isinstance(input_img, np.ndarray): - if len(input_img.shape) == 2: - input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR) - input_img = input_img[:, :, ::-1] # in rgb order - input_img = Image.fromarray( - input_img.astype('uint8')).convert('RGB') - img_tensor = self.img_preprocessor(input_img)[None, ...] - else: - raise TypeError( - f'img should be either PIL.Image or np.array, but got {type(input_img)}' - ) - - if self.device_id >= 0: - img_tensor = img_tensor.to('cuda:{}'.format(self.device_id)) - - img_embedding = self.clip_model( - input_data=img_tensor, input_type='img') - from modelscope.outputs import OutputKeys - output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy() - - if 'text' in input and input['text'] is not None: - text_str = input['text'] - if isinstance(text_str, str): - text_ids_tensor, text_mask_tensor = self.tokenize_text( - text_str) - else: - raise TypeError( - f'text should be str, but got {type(text_str)}') - - if self.device_id >= 0: - text_ids_tensor = text_ids_tensor.to('cuda:{}'.format( - self.device_id)) - text_mask_tensor = text_mask_tensor.to('cuda:{}'.format( - self.device_id)) - - text_embedding = self.clip_model( - input_data=(text_ids_tensor, text_mask_tensor), - input_type='text') - output['text_embedding'] = text_embedding.data.cpu().numpy() - - return output - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs diff --git a/modelscope/models/multi_modal/clip/clip_vit.py b/modelscope/models/multi_modal/clip/clip_vit.py deleted file mode 100644 index cfe67426..00000000 --- a/modelscope/models/multi_modal/clip/clip_vit.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2021 The OpenAI CLIP Authors. All rights reserved. 
- -from collections import OrderedDict -from typing import Tuple, Union - -import numpy as np -import torch -import torch.nn.functional as F -import torch.utils.checkpoint as checkpoint -from torch import nn - - -class LayerNorm(nn.LayerNorm): - """Subclass torch's LayerNorm to handle fp16.""" - - def forward(self, x: torch.Tensor): - orig_type = x.dtype - ret = super().forward(x.type(torch.float32)) - return ret.type(orig_type) - - -class QuickGELU(nn.Module): - - def forward(self, x: torch.Tensor): - return x * torch.sigmoid(1.702 * x) - - -class ResidualAttentionBlock(nn.Module): - - def __init__(self, - d_model: int, - n_head: int, - attn_mask: torch.Tensor = None): - super().__init__() - - self.attn = nn.MultiheadAttention(d_model, n_head) - self.ln_1 = LayerNorm(d_model) - self.mlp = nn.Sequential( - OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), - ('gelu', QuickGELU()), - ('c_proj', nn.Linear(d_model * 4, d_model))])) - self.ln_2 = LayerNorm(d_model) - self.attn_mask = attn_mask - - def attention(self, x: torch.Tensor): - self.attn_mask = self.attn_mask.to( - dtype=x.dtype, - device=x.device) if self.attn_mask is not None else None - return self.attn( - x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] - - def forward(self, x: torch.Tensor): - x = x + self.attention(self.ln_1(x)) - x = x + self.mlp(self.ln_2(x)) - return x - - -class Transformer(nn.Module): - - def __init__(self, - width: int, - layers: int, - heads: int, - attn_mask: torch.Tensor = None, - use_grad_ckp: bool = True): - super().__init__() - self.width = width - self.layers = layers - self.resblocks = nn.Sequential(*[ - ResidualAttentionBlock(width, heads, attn_mask) - for _ in range(layers) - ]) - - self.use_grad_ckp = use_grad_ckp - - def forward(self, x: torch.Tensor): - if self.use_grad_ckp: - for each_block in self.resblocks: - x = checkpoint.checkpoint(each_block, x) - return x - else: - return self.resblocks(x) - - -class VisionTransformer(nn.Module): - - def __init__(self, input_resolution: int, patch_size: int, width: int, - layers: int, heads: int, output_dim: int, use_grad_ckp: bool): - super().__init__() - self.input_resolution = input_resolution - self.output_dim = output_dim - self.conv1 = nn.Conv2d( - in_channels=3, - out_channels=width, - kernel_size=patch_size, - stride=patch_size, - bias=False) - - scale = width**-0.5 - self.class_embedding = nn.Parameter(scale * torch.randn(width)) - self.positional_embedding = nn.Parameter(scale * torch.randn( - (input_resolution // patch_size)**2 + 1, width)) - self.ln_pre = LayerNorm(width) - - self.transformer = Transformer( - width, layers, heads, use_grad_ckp=use_grad_ckp) - - self.ln_post = LayerNorm(width) - self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) - - def forward(self, x: torch.Tensor): - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], - -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - class_embeddings = self.class_embedding.to(x.dtype) + \ - torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) - x = torch.cat([class_embeddings, x], dim=1) - x = x + self.positional_embedding.to(x.dtype) - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - x = self.ln_post(x[:, 0, :]) - - if self.proj is not None: - x = x @ self.proj - - return x diff --git a/modelscope/models/multi_modal/clip/configuration_bert.py 
b/modelscope/models/multi_modal/clip/configuration_bert.py new file mode 100644 index 00000000..b75f5db8 --- /dev/null +++ b/modelscope/models/multi_modal/clip/configuration_bert.py @@ -0,0 +1,82 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging + +logger = logging.getLogger(__name__) + + +class BertConfig(object): + r""" + :class:`~transformers.BertConfig` is the configuration class to store the configuration of a + `BertModel`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm.
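+        output_attentions: Whether the model also returns the attention weights of
+            each layer (consumed by `BertSelfAttention` below).
+        output_hidden_states: Whether the model also returns the hidden states of
+            every layer in addition to the last-layer output.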
+ """ + + def __init__(self, + vocab_size_or_config_json_file=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + output_attentions=False, + output_hidden_states=False): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py new file mode 100644 index 00000000..2fb0d7e3 --- /dev/null +++ b/modelscope/models/multi_modal/clip/model.py @@ -0,0 +1,677 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Tuple, Union + +import json +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torchvision.transforms import Compose, Normalize, Resize, ToTensor + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer +from modelscope.models.multi_modal.clip.configuration_bert import BertConfig +from modelscope.models.multi_modal.clip.modeling_bert import BertModel +from modelscope.utils.constant import ModeKeys, ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['CLIPForMultiModalEmbedding'] + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.reshape(x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]).permute(2, 0, + 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + + return x[0] + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), + (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class 
VisualTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat( + [ # noqa + self.class_embedding.to(x.dtype) + torch.zeros( # noqa + x.shape[0], + 1, + x.shape[-1], + dtype=x.dtype, + device=x.device), + x # noqa + ], + dim=1) # noqa shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + vocab_size: int, + text_attention_probs_dropout_prob: float, + text_hidden_act: str, + text_hidden_dropout_prob: float, + text_hidden_size: int, + text_initializer_range: float, + text_intermediate_size: int, + text_max_position_embeddings: int, + text_num_attention_heads: int, + text_num_hidden_layers: int, + text_type_vocab_size: int, + tokenizer: FullTokenizer, + ): + super().__init__() + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.bert_config = BertConfig( + vocab_size_or_config_json_file=vocab_size, + hidden_size=text_hidden_size, + num_hidden_layers=text_num_hidden_layers, + num_attention_heads=text_num_attention_heads, + intermediate_size=text_intermediate_size, + hidden_act=text_hidden_act, + hidden_dropout_prob=text_hidden_dropout_prob, + attention_probs_dropout_prob=text_attention_probs_dropout_prob, + max_position_embeddings=text_max_position_embeddings, + type_vocab_size=text_type_vocab_size, + initializer_range=text_initializer_range, + layer_norm_eps=1e-12, + ) + self.bert = BertModel(self.bert_config) + + self.text_projection = nn.Parameter( + torch.empty(text_hidden_size, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.tokenizer = tokenizer + + self.initialize_parameters() + + def initialize_parameters(self): + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + if isinstance(self.visual, 
ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.bert_config.hidden_size**-0.5) + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + pad_index = self.tokenizer.vocab['[PAD]'] + attn_mask = text.ne(pad_index).type(self.dtype) + x = self.bert( + text, attention_mask=attn_mask)[0].type( + self.dtype) # [batch_size, seq_length, hidden_size] + return x[:, 0, :] @ self.text_projection + + def forward(self, image, text): + assert image is not None or text is not None, 'text and image cannot both be None!' + + if image is None: + return self.encode_text(text) + elif text is None: + return self.encode_image(image) + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + return image_features, text_features, self.logit_scale.exp() + + def get_similarity(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_models_to_fp32(model): + for p in model.parameters(): + p.data = p.data.float() + if p.grad: + p.grad.data = p.grad.data.float() + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(module): + if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)): + module.weight.data = module.weight.data.half() + if module.bias is not None: + module.bias.data = module.bias.data.half() + + if isinstance(module, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(module, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + if isinstance(module, BertModel): + module.to(torch.half) + + for name in ['text_projection', 'proj']: + if hasattr(module, name): + attr = getattr(module, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def _convert_to_rgb(image): + return image.convert('RGB') + + +def image_transform(image_size=224): + transform = Compose([ + _convert_to_rgb, + Resize((image_size, image_size)), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), 
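+        # The mean/std values above are the standard image normalization
+        # statistics used by the original OpenAI CLIP preprocessing.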
+ ]) + return transform + + +@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) +class CLIPForMultiModalEmbedding(TorchModel): + + def __init__(self, model_dir, device_id=-1): + super().__init__(model_dir=model_dir, device_id=device_id) + + # Initialize the model. + vision_model_config_file = '{}/vision_model_config.json'.format( + model_dir) + logger.info( + f'Loading vision model config from {vision_model_config_file}') + assert os.path.exists(vision_model_config_file) + + text_model_config_file = '{}/text_model_config.json'.format(model_dir) + logger.info(f'Loading text model config from {text_model_config_file}') + assert os.path.exists(text_model_config_file) + + with open(vision_model_config_file, + 'r') as fv, open(text_model_config_file, 'r') as ft: + model_info = json.load(fv) + for k, v in json.load(ft).items(): + model_info[k] = v + + # image preprocess + self.img_preprocess = image_transform(model_info['image_resolution']) + + # text tokenizer + vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' + self.tokenizer = FullTokenizer(vocab_file=vocab_file) + + # initialize the model + self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) + convert_weights(self.clip_model) + + # restore the pretrained weight + checkpoint = torch.load( + f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') + sd = checkpoint['state_dict'] + if next(iter(sd.items()))[0].startswith('module'): + sd = {k[len('module.'):]: v for k, v in sd.items()} + self.clip_model.load_state_dict(sd) + self.clip_model.eval() + + # place the model + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + if self.device == 'cuda': + self.clip_model.to(self.device) + logger.info('Use GPU for inference') + else: + self.clip_model.float() + logger.info('Use CPU for inference') + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 52) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all baseline models use 24 as the context length + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + all_tokens = [] + for text in texts: + all_tokens.append( + [self.tokenizer.vocab['[CLS]']] + + self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(text))[:context_length - 2] + + [self.tokenizer.vocab['[SEP]']]) + + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + assert len(tokens) <= context_length + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + from modelscope.outputs import OutputKeys + output = { + OutputKeys.IMG_EMBEDDING: None, + OutputKeys.TEXT_EMBEDDING: None + } + if 'img' in input and input['img'] is not None: + image_input = input['img'] + + # single image input + if isinstance(image_input, Image.Image): + image_tensor = self.img_preprocess(image_input).unsqueeze(0) + # multi images input + elif isinstance(image_input, list): + if all([isinstance(elem, Image.Image) + for elem in image_input]): + image_tensor = torch.stack( + [self.img_preprocess(elem) for elem in image_input], + dim=0) + else: + unsupported_elem_type = [ + type(elem) for elem in image_input + if not 
isinstance(elem, Image.Image) + ][0] + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], \ + but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' + ) + + image_tensor = image_tensor.to(self.device) + + with torch.no_grad(): + image_features = self.clip_model.encode_image(image_tensor) + image_features /= image_features.norm( + dim=-1, keepdim=True) # l2-normalize + + output[OutputKeys.IMG_EMBEDDING] = image_features + + if 'text' in input and input['text'] is not None: + text_input = input['text'] + + # single text input + if isinstance(text_input, str): + text_tensor = self.tokenize(text_input) + # multi texts input + elif isinstance(text_input, list): + if all([isinstance(elem, str) for elem in text_input]): + text_tensor = self.tokenize(text_input) + else: + unsupported_elem_type = [ + type(elem) for elem in text_input + if not isinstance(elem, str) + ][0] + raise TypeError( + f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' + ) + # others + else: + raise TypeError( + f'text should be str or List[str], but got {type(text_input)}' + ) + + text_tensor = text_tensor.to(self.device) + + with torch.no_grad(): + text_features = self.clip_model.encode_text(text_tensor) + text_features /= text_features.norm( + dim=-1, keepdim=True) # l2-normalize + output[OutputKeys.TEXT_EMBEDDING] = text_features + + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + + @property + def temperature(self): + return 1.0 / self.clip_model.logit_scale.exp() diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py new file mode 100644 index 00000000..b5f104ce --- /dev/null +++ b/modelscope/models/multi_modal/clip/modeling_bert.py @@ -0,0 +1,507 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import logging +import math +import os +import sys +from io import open + +import json +import torch +from torch import nn + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + + +def gelu(x): + """ Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
+ Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh( + math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = { + 'gelu': gelu, + 'relu': torch.nn.functional.relu, + 'swish': swish, + 'gelu_new': gelu_new +} + +BertLayerNorm = torch.nn.LayerNorm + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
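+        # Together with the 1/sqrt(head_size) scaling and the softmax below, this
+        # implements standard scaled dot-product attention,
+        # softmax(Q @ K^T / sqrt(d)) @ V, computed per head on tensors of shape
+        # [batch, num_heads, seq_len, head_size].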
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if self.output_attentions else ( + context_layer, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def forward(self, input_tensor, attention_mask=None, head_mask=None): + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, + str) or (sys.version_info[0] == 2 + and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = 
BertOutput(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, + head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output, ) + attention_outputs[ + 1:] # add attentions if we output them + return outputs + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList( + [BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_outputs = layer_module(hidden_states, attention_mask, + head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1], ) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + outputs = (hidden_states, ) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states, ) + if self.output_attentions: + outputs = outputs + (all_attentions, ) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, + str) or (sys.version_info[0] == 2 + and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
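+        # Note: in this standalone copy the decoder weight is a fresh nn.Linear and
+        # is not automatically tied to the word embedding matrix; the separate
+        # `bias` parameter below is simply added to the logits in forward().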
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + config_class = BertConfig + base_model_prefix = 'bert' + + def __init__(self, config): + super(BertPreTrainedModel, self).__init__() + self.config = config + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) + of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, + used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + + def __init__(self, config): + super(BertModel, self).__init__(config) + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.apply(self._init_weights) + + def forward(self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
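+        # For example, a padding mask of [1, 1, 0] becomes the additive mask
+        # [0.0, 0.0, -10000.0] after the two lines below.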
+ extended_attention_mask = extended_attention_mask.to( + dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( + -1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, + -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( + -1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters( + )).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings( + input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids) + encoder_outputs = self.encoder( + embedding_output, extended_attention_mask, head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ + 1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 4d61e2d1..8617b8dd 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -136,7 +136,7 @@ class DiffusionForTextToImageSynthesis(Model): self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024 # text tokenizer - vocab_path = '{}/vocab.txt'.format(model_dir) + vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}' self.tokenizer = Tokenizer(vocab_file=vocab_path, seq_len=64) # diffusion process diff --git a/modelscope/models/multi_modal/gemm/gemm_base.py b/modelscope/models/multi_modal/gemm/gemm_base.py index 26eea0d5..db928212 100644 --- a/modelscope/models/multi_modal/gemm/gemm_base.py +++ b/modelscope/models/multi_modal/gemm/gemm_base.py @@ -491,7 +491,9 @@ class GEVL(nn.Module): gen_logits = self.to_logits(out_embs[-1:, ...]) probs = F.softmax(self.gen_logit_scale.exp() * gen_logits, dim=-1) pred = torch.argmax( - probs * (1.0 + torch.rand_like(probs)), axis=-1) + probs * (2.0 + torch.rand_like(probs)), axis=-1) + if int(pred) >= eot_token or int(pred) <= 0: + break pred_tokens.append(pred) text_input = torch.cat( [text_input, pred.permute(1, 0).contiguous()], axis=1) @@ -500,8 +502,6 @@ class GEVL(nn.Module): for out_tokens in pred_text_tokens: tokens = [] for x in out_tokens: - if x >= eot_token or x <= 0: - break tokens.append(int(x)) out_text = self.tokenizer.decode(tokens) out_text = out_text.strip() diff --git a/modelscope/models/multi_modal/mplug/__init__.py b/modelscope/models/multi_modal/mplug/__init__.py index bca5849b..955c87e2 100644 --- a/modelscope/models/multi_modal/mplug/__init__.py +++ b/modelscope/models/multi_modal/mplug/__init__.py @@ -14,5 +14,4 @@ # limitations under the License. 
from .configuration_mplug import MPlugConfig -from .modeling_mplug import (CONFIG_NAME, VOCAB_NAME, - MPlugForVisualQuestionAnswering) +from .modeling_mplug import CONFIG_NAME, MPlug diff --git a/modelscope/models/multi_modal/mplug/clip/clip.py b/modelscope/models/multi_modal/mplug/clip/clip.py index fbdfbd29..aa56e39b 100644 --- a/modelscope/models/multi_modal/mplug/clip/clip.py +++ b/modelscope/models/multi_modal/mplug/clip/clip.py @@ -5,9 +5,69 @@ from typing import Tuple, Union import torch import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint from torch import nn -from modelscope.models.multi_modal.clip.clip_vit import Transformer + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + use_grad_ckp: bool = True): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + self.use_grad_ckp = use_grad_ckp + + def forward(self, x: torch.Tensor): + if self.use_grad_ckp: + for each_block in self.resblocks: + x = checkpoint.checkpoint(each_block, x) + return x + else: + return self.resblocks(x) class Bottleneck(nn.Module): diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index 6b2914c4..c275ed15 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -15,14 +15,14 @@ # limitations under the License. 
""" MPLUG model configuration """ import os -from collections import OrderedDict -from typing import Any, Dict, Mapping, Union +from typing import Any, Dict, Union import yaml from transformers import PretrainedConfig -from transformers.onnx import OnnxConfig from transformers.utils import logging +from modelscope.utils.constant import Tasks + logger = logging.get_logger(__name__) @@ -32,6 +32,7 @@ class MPlugConfig(PretrainedConfig): def __init__( self, + task=Tasks.visual_question_answering, bert_config='config_bert.json', image_res=504, batch_size_train=128, @@ -64,7 +65,9 @@ class MPlugConfig(PretrainedConfig): clip_transformer_heads=12, clip_transformer_layers=12, **kwargs): + super().__init__(**kwargs) + self.task = task self.bert_config = bert_config self.image_res = image_res self.batch_size_train = batch_size_train @@ -103,23 +106,3 @@ class MPlugConfig(PretrainedConfig): with open(yaml_file, 'r') as reader: config_dict = yaml.load(reader, Loader=yaml.Loader) return cls(**config_dict) - - -class MPlugOnnxConfig(OnnxConfig): - - @property - def inputs(self) -> Mapping[str, Mapping[int, str]]: - return OrderedDict([ - ('input_ids', { - 0: 'batch', - 1: 'sequence' - }), - ('attention_mask', { - 0: 'batch', - 1: 'sequence' - }), - ('token_type_ids', { - 0: 'batch', - 1: 'sequence' - }), - ]) diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 0b45ea12..50622cc0 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -42,14 +42,13 @@ from transformers.utils import logging from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig from modelscope.models.multi_modal.mplug.predictor import TextGenerator +from modelscope.utils.constant import ModelFile transformers.logging.set_verbosity_error() logger = logging.get_logger(__name__) CONFIG_NAME = 'config.yaml' -WEIGHTS_NAME = 'pytorch_model.bin' -VOCAB_NAME = 'vocab.txt' _CONFIG_FOR_DOC = 'BertConfig' _TOKENIZER_FOR_DOC = 'BertTokenizer' @@ -1726,32 +1725,145 @@ class BertLMHeadModel(BertPreTrainedModel): return reordered_past -class MPlugForVisualQuestionAnswering(PreTrainedModel): +class BertPrefixModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward( + BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint='bert-base-uncased', + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=True, + reduction='mean', + soft_labels=None, + alpha=0, + return_logits=False, + ): + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels, + dim=-1) + loss_distill = loss_distill[labels != -100].mean() + lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class MPlug(PreTrainedModel): config_class = MPlugConfig def __init__(self, config): super().__init__(config) self.config = config self.tokenizer = BertTokenizer.from_pretrained( - os.path.join(config.model_dir, VOCAB_NAME)) + os.path.join(config.model_dir, ModelFile.VOCAB_FILE)) self.module_setting(config) self.visual_encoder = self._initialize_clip(config) self.text_encoder = BertModel( self.config_encoder, add_pooling_layer=False) self.fusion_encoder = FusionModel( self.config_fusion, add_pooling_layer=False) - self.text_decoder = BertLMHeadModel(self.config_decoder) - self.init_distill(config) - self.beam_generator = TextGenerator(config, self.text_decoder) @classmethod def from_pretrained(cls, model_dir, load_checkpoint=True): - config = MPlugConfig.from_yaml_file( + from modelscope.utils.constant import Tasks + + task_mapping = { + Tasks.visual_question_answering: MPlugForVisualQuestionAnswering, + Tasks.image_captioning: MPLUGForImageCaption + } + config = cls.config_class.from_yaml_file( os.path.join(model_dir, CONFIG_NAME)) config.model_dir = model_dir - model = cls(config) + model = task_mapping[config.task](config) if load_checkpoint: - checkpoint_path = os.path.join(model_dir, WEIGHTS_NAME) + checkpoint_path = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) checkpoint = torch.load(checkpoint_path, map_location='cpu') if 'model' in checkpoint: state_dict = checkpoint['model'] @@ -1803,6 +1915,161 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): clip_model.visual.positional_embedding = pos_embed return clip_model + def forward(self, *args, **kwargs): + raise NotImplementedError + + def module_setting(self, config): + bert_config_path = os.path.join(config.model_dir, config.bert_config) + self.config_encoder = 
BertConfig.from_json_file(bert_config_path) + self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers + self.config_fusion = BertConfig.from_json_file(bert_config_path) + self.config_decoder = BertConfig.from_json_file(bert_config_path) + self.config_decoder.add_cross_attention = True + self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers + self.large = False + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) + self.large = True + + @torch.no_grad() + def copy_params(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data.copy_(param.data) # initialize + param_m.requires_grad = False # not update by gradient + + @torch.no_grad() + def _momentum_update(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data = param_m.data * self.momentum + param.data * ( + 1. - self.momentum) + + def generation(self, question_states, question_atts, out_size=1): + encoder_inputs = [question_states, question_atts] + topk_ids, topk_scores = self.beam_generator.translate_batch( + encoder_inputs, out_size=out_size) + return topk_ids, topk_scores + + @staticmethod + def _tile(x, dim, n_tile): + import numpy as np + init_dim = x.size(dim) + repeat_idx = [1] * x.dim() + repeat_idx[dim] = n_tile + x = x.repeat(*(repeat_idx)) + order_index = torch.LongTensor( + np.concatenate( + [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) + return torch.index_select(x, dim, order_index.to(x.device)) + + def rank_answer(self, question_states, question_atts, answer_ids, + answer_atts, k): + + num_ques = question_states.size(0) + start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token + + start_output = self.text_decoder( + start_ids, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + return_dict=True, + reduction='none') + logits = start_output.logits[:, 0, :] # first token's logit + + # topk_probs: top-k probability + # topk_ids: [num_question, k] + answer_first_token = answer_ids[:, 1] + prob_first_token = F.softmax( + logits, dim=1).index_select( + dim=1, index=answer_first_token) + topk_probs, topk_ids = prob_first_token.topk(k, dim=1) + + # answer input: [num_question*k, answer_len] + input_ids = [] + input_atts = [] + for b, topk_id in enumerate(topk_ids): + input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) + input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) + input_ids = torch.cat(input_ids, dim=0) + input_atts = torch.cat(input_atts, dim=0) + + targets_ids = input_ids.masked_fill( + input_ids == self.tokenizer.pad_token_id, -100) + + # repeat encoder's output for top-k answers + question_states = self._tile(question_states, 0, k) + question_atts = self._tile(question_atts, 0, k) + + output = self.text_decoder( + input_ids, + attention_mask=input_atts, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=targets_ids, + return_dict=True, + reduction='none') + + answer_loss = output.loss + answer_loss = answer_loss.view(input_ids.size(0), -1) + + # topk_prob: first token probability + topk_probs = topk_probs.view(-1, 1) + log_probs = 
torch.cat([topk_probs.log(), -answer_loss], dim=1) + + # re-calculate log probabilities for the answer sequences using chain rule + log_probs_sum = log_probs.sum(1) + log_probs_sum = log_probs_sum.view(num_ques, k) + + topk_probs = F.softmax(log_probs_sum, dim=-1) + # get top-k after re-ranking + topk_probs, rerank_id = topk_probs.topk(k, dim=1) + topk_ids = torch.gather(topk_ids, 1, rerank_id) + + return topk_ids, topk_probs + + +class MPlugForVisualQuestionAnswering(MPlug): + + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertLMHeadModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) + self.init_distill(config) + + def init_distill(self, config): + self.distill = config.distill + if self.distill: + self.visual_encoder_m = self._initialize_clip(config) + self.text_encoder_m = BertModel( + self.config_encoder, add_pooling_layer=False) + self.fusion_encoder_m = FusionModel( + self.config_fusion, add_pooling_layer=False) + self.text_decoder_m = BertLMHeadModel(self.config_decoder) + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.text_decoder, self.text_decoder_m], + ] + if self.config_encoder.hidden_size != config.vision_width: + self.visn_fc_m = nn.Linear(config.vision_width, + self.config_encoder.hidden_size) + self.visn_layer_norm_m = nn.LayerNorm( + self.config_encoder.hidden_size, eps=1e-12) + self.dropout_m = nn.Dropout( + self.config_encoder.hidden_dropout_prob) + self.model_pairs.extend( + [[self.visn_fc, self.visn_fc_m], + [self.visn_layer_norm, self.visn_layer_norm_m]]) + self.copy_params() + self.momentum = 0.995 + def forward(self, image, question, @@ -1935,145 +2202,110 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel): merge_text_attention) return topk_ids, topk_probs - def module_setting(self, config): - bert_config_path = os.path.join(config.model_dir, config.bert_config) - self.config_encoder = BertConfig.from_json_file(bert_config_path) - self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers - self.config_fusion = BertConfig.from_json_file(bert_config_path) - self.config_decoder = BertConfig.from_json_file(bert_config_path) - self.config_decoder.add_cross_attention = True - self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers - self.large = False - if self.config_encoder.hidden_size != config.vision_width: - self.visn_fc = nn.Linear(config.vision_width, - self.config_encoder.hidden_size) - self.visn_layer_norm = nn.LayerNorm( - self.config_encoder.hidden_size, eps=1e-12) - self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob) - self.large = True - - def init_distill(self, config): - self.distill = config.distill - if self.distill: - self.visual_encoder_m = self._initialize_clip(config) - self.text_encoder_m = BertModel( - self.config_encoder, add_pooling_layer=False) - self.fusion_encoder_m = FusionModel( - self.config_fusion, add_pooling_layer=False) - self.text_decoder_m = BertLMHeadModel(self.config_decoder) - self.model_pairs = [ - [self.visual_encoder, self.visual_encoder_m], - [self.text_encoder, self.text_encoder_m], - [self.text_decoder, self.text_decoder_m], - ] - if self.config_encoder.hidden_size != config.vision_width: - self.visn_fc_m = nn.Linear(config.vision_width, - self.config_encoder.hidden_size) - self.visn_layer_norm_m = nn.LayerNorm( - self.config_encoder.hidden_size, eps=1e-12) - self.dropout_m = nn.Dropout( - 
self.config_encoder.hidden_dropout_prob) - self.model_pairs.extend( - [[self.visn_fc, self.visn_fc_m], - [self.visn_layer_norm, self.visn_layer_norm_m]]) - self.copy_params() - self.momentum = 0.995 - - @torch.no_grad() - def copy_params(self): - for model_pair in self.model_pairs: - for param, param_m in zip(model_pair[0].parameters(), - model_pair[1].parameters()): - param_m.data.copy_(param.data) # initialize - param_m.requires_grad = False # not update by gradient - - @torch.no_grad() - def _momentum_update(self): - for model_pair in self.model_pairs: - for param, param_m in zip(model_pair[0].parameters(), - model_pair[1].parameters()): - param_m.data = param_m.data * self.momentum + param.data * ( - 1. - self.momentum) - - def generation(self, question_states, question_atts): - encoder_inputs = [question_states, question_atts] - topk_ids, topk_scores = self.beam_generator.translate_batch( - encoder_inputs) - return topk_ids, topk_scores - - @staticmethod - def _tile(x, dim, n_tile): - import numpy as np - init_dim = x.size(dim) - repeat_idx = [1] * x.dim() - repeat_idx[dim] = n_tile - x = x.repeat(*(repeat_idx)) - order_index = torch.LongTensor( - np.concatenate( - [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) - return torch.index_select(x, dim, order_index.to(x.device)) - - def rank_answer(self, question_states, question_atts, answer_ids, - answer_atts, k): - - num_ques = question_states.size(0) - start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token - start_output = self.text_decoder( - start_ids, - encoder_hidden_states=question_states, - encoder_attention_mask=question_atts, - return_dict=True, - reduction='none') - logits = start_output.logits[:, 0, :] # first token's logit +class MPLUGForImageCaption(MPlug): - # topk_probs: top-k probability - # topk_ids: [num_question, k] - answer_first_token = answer_ids[:, 1] - prob_first_token = F.softmax( - logits, dim=1).index_select( - dim=1, index=answer_first_token) - topk_probs, topk_ids = prob_first_token.topk(k, dim=1) - - # answer input: [num_question*k, answer_len] - input_ids = [] - input_atts = [] - for b, topk_id in enumerate(topk_ids): - input_ids.append(answer_ids.index_select(dim=0, index=topk_id)) - input_atts.append(answer_atts.index_select(dim=0, index=topk_id)) - input_ids = torch.cat(input_ids, dim=0) - input_atts = torch.cat(input_atts, dim=0) - - targets_ids = input_ids.masked_fill( - input_ids == self.tokenizer.pad_token_id, -100) - - # repeat encoder's output for top-k answers - question_states = self._tile(question_states, 0, k) - question_atts = self._tile(question_atts, 0, k) + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertPrefixModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) - output = self.text_decoder( - input_ids, - attention_mask=input_atts, - encoder_hidden_states=question_states, - encoder_attention_mask=question_atts, - labels=targets_ids, - return_dict=True, - reduction='none') + def beam_search(self, + image, + question, + answer=None, + train=True, + out_size=5): + image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) + if self.large: + image_embeds = self.dropout( + self.visn_layer_norm(self.visn_fc(image_embeds))) + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(image.device) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = 
self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) + image_output, question_output = fusion_output + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat([image_atts, question.attention_mask], + 1) + topk_ids, topk_probs = self.generation( + question_output, merge_text_attention, out_size=out_size) + return topk_ids, topk_probs - answer_loss = output.loss - answer_loss = answer_loss.view(input_ids.size(0), -1) + def forward(self, + image, + question, + answer=None, + train=True, + out_size=5, + scst=False): + if (scst): + return self.beam_search( + image, question, answer, train=True, out_size=out_size) + image = image.to(dtype=next(self.parameters()).dtype) + image_embeds = self.visual_encoder.visual(image, skip_last_layer=True) + if self.large: + image_embeds = self.dropout( + self.visn_layer_norm(self.visn_fc(image_embeds))) + image_atts = torch.ones( + image_embeds.size()[:-1], dtype=torch.long).to(image.device) - # topk_prob: first token probability - topk_probs = topk_probs.view(-1, 1) - log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1) + if train: + answer_targets = answer.input_ids.masked_fill( + answer.input_ids == self.tokenizer.pad_token_id, -100) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) - # re-calculate log probabilities for the answer sequences using chain rule - log_probs_sum = log_probs.sum(1) - log_probs_sum = log_probs_sum.view(num_ques, k) + image_output, question_output = fusion_output - topk_probs = F.softmax(log_probs_sum, dim=-1) - # get top-k after re-ranking - topk_probs, rerank_id = topk_probs.topk(k, dim=1) - topk_ids = torch.gather(topk_ids, 1, rerank_id) + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat( + [image_atts, question.attention_mask], 1) - return topk_ids, topk_probs + answer_output = self.text_decoder( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=question_output, + encoder_attention_mask=merge_text_attention, + labels=answer_targets, + return_dict=True, + reduction='none') + loss = answer_output.loss + return loss + else: + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False) + image_output, question_output = fusion_output + question_output = torch.cat([image_output, question_output], 1) + merge_text_attention = torch.cat( + [image_atts, question.attention_mask], 1) + topk_ids, topk_probs = self.generation(question_output, + merge_text_attention) + return topk_ids, topk_probs diff --git a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py similarity index 60% rename from modelscope/models/multi_modal/mplug_for_visual_question_answering.py rename to 
modelscope/models/multi_modal/mplug_for_all_tasks.py index 88875fda..bb5a9c46 100644 --- a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -6,12 +6,13 @@ from modelscope.models.base import Tensor from modelscope.models.builder import MODELS from modelscope.utils.constant import Tasks -__all__ = ['MPlugForVisualQuestionAnswering'] +__all__ = ['MPlugForAllTasks'] @MODELS.register_module( Tasks.visual_question_answering, module_name=Models.mplug) -class MPlugForVisualQuestionAnswering(TorchModel): +@MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug) +class MPlugForAllTasks(TorchModel): def __init__(self, model_dir: str, *args, **kwargs): """initialize the mplug model from the `model_dir` path. @@ -20,8 +21,8 @@ class MPlugForVisualQuestionAnswering(TorchModel): """ super().__init__(model_dir, *args, **kwargs) - from modelscope.models.multi_modal.mplug import MPlugForVisualQuestionAnswering - self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir) + from modelscope.models.multi_modal.mplug import MPlug + self.model = MPlug.from_pretrained(model_dir) self.tokenizer = self.model.tokenizer def train(self): @@ -44,4 +45,13 @@ class MPlugForVisualQuestionAnswering(TorchModel): } """ - return self.model(**input)[0] + topk_ids, _ = self.model(**input) + replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), + ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), + ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) + + pred_string = self.tokenizer.decode(topk_ids[0][0]) + for _old, _new in replace_tokens_bert: + pred_string = pred_string.replace(_old, _new) + pred_string = pred_string.strip() + return pred_string diff --git a/modelscope/models/multi_modal/ofa/tokenization_ofa.py b/modelscope/models/multi_modal/ofa/tokenization_ofa.py index 158905eb..fd50505c 100644 --- a/modelscope/models/multi_modal/ofa/tokenization_ofa.py +++ b/modelscope/models/multi_modal/ofa/tokenization_ofa.py @@ -22,6 +22,8 @@ from transformers.models.bert.tokenization_bert import (BasicTokenizer, WordpieceTokenizer) from transformers.utils import logging +from modelscope.utils.constant import ModelFile + logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'} @@ -42,7 +44,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'ofa-base': 1024, } -VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP_ZH = { 'vocab_file': { diff --git a/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py b/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py index 03d2d71e..db11370d 100644 --- a/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py +++ b/modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py @@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerFast from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast from transformers.utils import logging +from modelscope.utils.constant import ModelFile from .tokenization_ofa import OFATokenizer, OFATokenizerZH logger = logging.get_logger(__name__) @@ -50,7 +51,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'ofa-base': 1024, } -VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP_ZH = { 'vocab_file': { diff --git a/modelscope/models/nlp/structbert/tokenization_sbert.py 
b/modelscope/models/nlp/structbert/tokenization_sbert.py index cbf98746..3171e31d 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert.py +++ b/modelscope/models/nlp/structbert/tokenization_sbert.py @@ -23,11 +23,12 @@ from typing import List, Optional, Tuple from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace) +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger logger = get_logger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE} PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}} diff --git a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py index 5b8d79cc..a0a81121 100644 --- a/modelscope/models/nlp/structbert/tokenization_sbert_fast.py +++ b/modelscope/models/nlp/structbert/tokenization_sbert_fast.py @@ -22,13 +22,14 @@ import transformers from tokenizers import normalizers from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from .tokenization_sbert import SbertTokenizer logger = get_logger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.txt', + 'vocab_file': ModelFile.VOCAB_FILE, 'tokenizer_file': 'tokenizer.json' } diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 1e84dd8a..6e4486dd 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.msdatasets.config import MS_DATASETS_CACHE +from modelscope.utils.config import ConfigDict from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger +from .task_datasets.builder import build_task_dataset +from .utils.dataset_builder import ExternalDataset from .utils.dataset_utils import (get_dataset_files, get_target_dataset_structure, load_dataset_builder) @@ -67,9 +70,16 @@ class MsDataset: def __len__(self): return len(self._hf_ds) + @property + def config_kwargs(self): + if isinstance(self._hf_ds, ExternalDataset): + return self._hf_ds.config_kwargs + else: + return None + @classmethod def from_hf_dataset(cls, - hf_ds: Union[Dataset, DatasetDict], + hf_ds: Union[Dataset, DatasetDict, ExternalDataset], target: str = None) -> Union[dict, 'MsDataset']: if isinstance(hf_ds, Dataset): return cls(hf_ds, target) @@ -77,6 +87,8 @@ class MsDataset: if len(hf_ds.keys()) == 1: return cls(next(iter(hf_ds.values())), target) return {k: cls(v, target) for k, v in hf_ds.items()} + elif isinstance(hf_ds, ExternalDataset): + return cls(hf_ds) else: raise TypeError( f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' @@ -96,7 +108,8 @@ class MsDataset: Mapping[str, Union[str, Sequence[str]]]]] = None, download_mode: Optional[DownloadMode] = DownloadMode. - REUSE_DATASET_IF_EXISTS + REUSE_DATASET_IF_EXISTS, + **config_kwargs, ) -> Union[dict, 'MsDataset']: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: @@ -113,6 +126,7 @@ class MsDataset: hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope download_mode (DownloadMode or str, optional): How to treat existing datasets. 
default DownloadMode.REUSE_DATASET_IF_EXISTS + **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. @@ -128,7 +142,8 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) return MsDataset.from_hf_dataset(dataset, target=target) elif hub == Hubs.modelscope: return MsDataset._load_ms_dataset( @@ -140,22 +155,22 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - download_mode=download_mode) + download_mode=download_mode, + **config_kwargs) @staticmethod - def _load_ms_dataset( - dataset_name: Union[str, list], - namespace: Optional[str] = None, - target: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[str, Sequence[str], - Mapping[str, Union[str, - Sequence[str]]]]] = None, - download_mode: Optional[DownloadMode] = None - ) -> Union[dict, 'MsDataset']: + def _load_ms_dataset(dataset_name: Union[str, list], + namespace: Optional[str] = None, + target: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[ + str, Sequence[str], + Mapping[str, Union[str, Sequence[str]]]]] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[dict, 'MsDataset']: if isinstance(dataset_name, str): dataset_formation = DatasetFormations.native if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ @@ -184,7 +199,8 @@ class MsDataset: data_dir=data_dir, data_files=data_files, cache_dir=MS_DATASETS_CACHE, - download_mode=download_mode.value) + download_mode=download_mode.value, + **config_kwargs) else: dataset = MsDataset._load_from_ms( dataset_name, @@ -195,7 +211,7 @@ class MsDataset: subset_name=subset_name, split=split, download_mode=download_mode, - ) + **config_kwargs) elif isinstance(dataset_name, list): if target is None: target = 'target' @@ -206,16 +222,15 @@ class MsDataset: return MsDataset.from_hf_dataset(dataset, target=target) @staticmethod - def _load_from_ms( - dataset_name: str, - dataset_files: dict, - download_dir: str, - namespace: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - download_mode: Optional[DownloadMode] = None, - ) -> Union[Dataset, DatasetDict]: + def _load_from_ms(dataset_name: str, + dataset_files: dict, + download_dir: str, + namespace: Optional[str] = None, + version: Optional[str] = DEFAULT_DATASET_REVISION, + subset_name: Optional[str] = None, + split: Optional[str] = None, + download_mode: Optional[DownloadMode] = None, + **config_kwargs) -> Union[Dataset, DatasetDict]: for json_path in dataset_files['.json']: if json_path.endswith(f'{dataset_name}.json'): with open(json_path, encoding='utf-8') as dataset_json_file: @@ -226,7 +241,6 @@ class MsDataset: meta_map, file_map = get_dataset_files(target_dataset_structure, dataset_name, namespace, version) - builder = load_dataset_builder( dataset_name, subset_name, @@ -235,7 +249,8 @@ class MsDataset: zip_data_files=file_map, cache_dir=MS_DATASETS_CACHE, version=version, - split=list(target_dataset_structure.keys())) + 
split=list(target_dataset_structure.keys()), + **config_kwargs) download_config = DownloadConfig( cache_dir=download_dir, @@ -253,7 +268,6 @@ class MsDataset: data_dir=download_dir, ) builder.download_and_prepare( - download_config=download_config, dl_manager=dl_manager, download_mode=download_mode.value, try_from_hf_gcs=False) @@ -338,6 +352,8 @@ class MsDataset: self, columns: Union[str, List[str]] = None, preprocessors: Union[Callable, List[Callable]] = None, + task_name: str = None, + task_data_config: ConfigDict = None, **format_kwargs, ): """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to @@ -350,6 +366,8 @@ class MsDataset: columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of processors will also be added. + task_name (str, default None): task name, refer to :obj:`Tasks` for more details + task_data_config (ConfigDict, default None): config dict for model object. format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. Returns: @@ -360,6 +378,10 @@ class MsDataset: raise ImportError( 'The function to_torch_dataset requires pytorch to be installed' ) + if isinstance(self._hf_ds, ExternalDataset): + task_data_config.update({'preprocessor': preprocessors}) + return build_task_dataset(task_data_config, task_name, + self._hf_ds.config_kwargs) if preprocessors is not None: return self.to_torch_dataset_with_processors( preprocessors, columns=columns) diff --git a/modelscope/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py similarity index 80% rename from modelscope/task_datasets/__init__.py rename to modelscope/msdatasets/task_datasets/__init__.py index 93e01cb5..c80f8cd5 100644 --- a/modelscope/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: from .builder import TASK_DATASETS, build_task_dataset from .torch_base_dataset import TorchTaskDataset from .veco_dataset import VecoDataset + from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset else: _import_structure = { @@ -15,6 +16,8 @@ else: 'builder': ['TASK_DATASETS', 'build_task_dataset'], 'torch_base_dataset': ['TorchTaskDataset'], 'veco_dataset': ['VecoDataset'], + 'image_instance_segmentation_coco_dataset': + ['ImageInstanceSegmentationCocoDataset'] } import sys diff --git a/modelscope/task_datasets/base.py b/modelscope/msdatasets/task_datasets/base.py similarity index 100% rename from modelscope/task_datasets/base.py rename to modelscope/msdatasets/task_datasets/base.py diff --git a/modelscope/task_datasets/builder.py b/modelscope/msdatasets/task_datasets/builder.py similarity index 100% rename from modelscope/task_datasets/builder.py rename to modelscope/msdatasets/task_datasets/builder.py diff --git a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py similarity index 90% rename from modelscope/models/cv/image_instance_segmentation/datasets/dataset.py rename to modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py index d9e1b348..04c8e142 100644 --- a/modelscope/models/cv/image_instance_segmentation/datasets/dataset.py +++ b/modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py @@ -2,14 +2,32 @@ import os.path as osp 
import numpy as np from pycocotools.coco import COCO -from torch.utils.data import Dataset - -class ImageInstanceSegmentationCocoDataset(Dataset): +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + +DATASET_STRUCTURE = { + 'train': { + 'annotation': 'annotations/instances_train.json', + 'images': 'images/train' + }, + 'validation': { + 'annotation': 'annotations/instances_val.json', + 'images': 'images/val' + } +} + + +@TASK_DATASETS.register_module( + module_name=Models.cascade_mask_rcnn_swin, + group_key=Tasks.image_segmentation) +class ImageInstanceSegmentationCocoDataset(TorchTaskDataset): """Coco-style dataset for image instance segmentation. Args: - ann_file (str): Annotation file path. + split_config (dict): Annotation file path. {"train":"xxxxx"} classes (Sequence[str], optional): Specify classes to load. If is None, ``cls.CLASSES`` will be used. Default: None. data_root (str, optional): Data root for ``ann_file``, @@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset): 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') def __init__(self, - ann_file, + split_config: dict, + preprocessor=None, classes=None, - data_root=None, - img_prefix='', seg_prefix=None, test_mode=False, - filter_empty_gt=True): - self.ann_file = ann_file - self.data_root = data_root - self.img_prefix = img_prefix + filter_empty_gt=True, + **kwargs): + self.data_root = next(iter(split_config.values())) + self.split = next(iter(split_config.keys())) + self.preprocessor = preprocessor + + self.ann_file = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['annotation']) + + self.img_prefix = osp.join(self.data_root, + DATASET_STRUCTURE[self.split]['images']) self.seg_prefix = seg_prefix self.test_mode = test_mode self.filter_empty_gt = filter_empty_gt self.CLASSES = self.get_classes(classes) - # join paths if data_root is specified - if self.data_root is not None: - if not osp.isabs(self.ann_file): - self.ann_file = osp.join(self.data_root, self.ann_file) - if not (self.img_prefix is None or osp.isabs(self.img_prefix)): - self.img_prefix = osp.join(self.data_root, self.img_prefix) - if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): - self.seg_prefix = osp.join(self.data_root, self.seg_prefix) - # load annotations self.data_infos = self.load_annotations(self.ann_file) @@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset): # set group flag for the sampler self._set_group_flag() - self.preprocessor = None - def __len__(self): """Total number of samples of data.""" return len(self.data_infos) @@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset): raise ValueError(f'Unsupported type {type(classes)} of classes.') return class_names - - def to_torch_dataset(self, preprocessors=None): - self.preprocessor = preprocessors - return self diff --git a/modelscope/task_datasets/torch_base_dataset.py b/modelscope/msdatasets/task_datasets/torch_base_dataset.py similarity index 100% rename from modelscope/task_datasets/torch_base_dataset.py rename to modelscope/msdatasets/task_datasets/torch_base_dataset.py diff --git a/modelscope/task_datasets/veco_dataset.py b/modelscope/msdatasets/task_datasets/veco_dataset.py similarity index 100% rename from modelscope/task_datasets/veco_dataset.py rename to modelscope/msdatasets/task_datasets/veco_dataset.py diff --git a/modelscope/msdatasets/utils/dataset_builder.py 
b/modelscope/msdatasets/utils/dataset_builder.py index 2b4bad07..85489c58 100644 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ b/modelscope/msdatasets/utils/dataset_builder.py @@ -8,6 +8,7 @@ from datasets.info import DatasetInfo from datasets.packaged_modules import csv from datasets.utils.filelock import FileLock +from modelscope.utils.constant import DownloadMode from modelscope.utils.logger import get_logger logger = get_logger() @@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv): zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, **config_kwargs, ): + self.namespace = namespace super().__init__( cache_dir=cache_dir, name=subset_name, hash=hash, - namespace=namespace, data_files=meta_data_files, **config_kwargs) @@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv): os.rmdir(self._cache_dir) self.zip_data_files = zip_data_files + def _relative_data_dir(self, with_version=True, with_hash=True) -> str: + """Relative path of this dataset in cache_dir: + Will be: + self.name/self.config.version/self.hash/ + or if a namespace has been specified: + self.namespace___self.name/self.config.version/self.hash/ + """ + builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}' + builder_config = self.config + hash = self.hash + if builder_config: + builder_data_dir = os.path.join(builder_data_dir, self.config_id) + if with_version: + builder_data_dir = os.path.join(builder_data_dir, + str(self.config.version)) + if with_hash and hash and isinstance(hash, str): + builder_data_dir = os.path.join(builder_data_dir, hash) + return builder_data_dir + def _build_cache_dir(self): builder_data_dir = os.path.join( self._cache_dir_root, @@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv): datasets.SplitGenerator( name=split_name, gen_kwargs={ - 'files': dl_manager.iter_files(files), - 'base_dir': zip_data_files.get(split_name) + 'files': + dl_manager.iter_files(files), + 'base_dir': + os.path.join( + zip_data_files.get(split_name), + os.path.splitext( + self.zip_data_files.get(split_name))[0]) + if self.zip_data_files.get(split_name) else + zip_data_files.get(split_name) })) return splits @@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv): logger.error( f"Failed to read file '{file}' with error {type(e)}: {e}") raise + + +class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): + + def __init__( + self, + dataset_name: str, + cache_dir: str, + namespace: str, + subset_name: str, + hash: str, + meta_data_files: Mapping[str, Union[str, Sequence[str]]], + zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, + **config_kwargs, + ): + self.name = dataset_name + self.subset_name = subset_name + self.namespace = namespace + self.hash = hash + self.data_files = meta_data_files + self.zip_data_files = zip_data_files + self.split_path_dict = None + self.config = None + self._cache_dir_root = os.path.expanduser(cache_dir) + self._cache_dir = self._build_cache_dir() + self._config_kwargs = config_kwargs + + def download_and_prepare(self, download_mode, dl_manager, + **download_kwargs): + # Prevent parallel disk operations + lock_path = os.path.join( + self._cache_dir_root, + self._cache_dir.replace(os.sep, '_') + '.lock') + with FileLock(lock_path): + data_exists = os.path.exists(self._cache_dir) + if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: + logger.warning( + f'Reusing dataset {self.name} ({self._cache_dir})') + return + logger.info(f'Generating dataset {self.name} ({self._cache_dir})') + 
self._download_and_prepare(dl_manager=dl_manager) + + def _download_and_prepare(self, dl_manager): + split_path_dict = dl_manager.download_and_extract(self.zip_data_files) + self.split_path_dict = { + k: os.path.join(v, + os.path.splitext(self.zip_data_files[k])[0]) + for k, v in split_path_dict.items() + } + + def as_dataset(self): + return ExternalDataset(self.split_path_dict, self._config_kwargs) + + +class ExternalDataset(object): + + def __init__(self, split_path_dict, config_kwargs): + config_kwargs.update({'split_config': split_path_dict}) + self.config_kwargs = config_kwargs + + def __len__(self): + return len(self.config_kwargs['split_config']) diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ff7cd8b1..09556d84 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder from modelscope.utils.constant import DEFAULT_DATASET_REVISION from modelscope.utils.logger import get_logger -from .dataset_builder import MsCsvDatasetBuilder +from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder logger = get_logger() @@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict, modelscope_api = HubApi() for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( - info['meta'], dataset_name, namespace, revision) + info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] return meta_map, file_map @@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, zip_data_files: Mapping[str, Union[str, Sequence[str]]], cache_dir: str, version: Optional[Union[str]], - split: Sequence[str]) -> DatasetBuilder: + split: Sequence[str], + **config_kwargs) -> DatasetBuilder: sub_dir = os.path.join(version, '_'.join(split)) - builder_instance = MsCsvDatasetBuilder( - dataset_name=dataset_name, - namespace=namespace, - cache_dir=cache_dir, - subset_name=subset_name, - meta_data_files=meta_data_files, - zip_data_files=zip_data_files, - hash=sub_dir) + meta_data_file = next(iter(meta_data_files.values())) + if not meta_data_file: + builder_instance = TaskSpecificDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir, + **config_kwargs) + elif meta_data_file.endswith('.csv'): + builder_instance = MsCsvDatasetBuilder( + dataset_name=dataset_name, + namespace=namespace, + cache_dir=cache_dir, + subset_name=subset_name, + meta_data_files=meta_data_files, + zip_data_files=zip_data_files, + hash=sub_dir) + else: + raise NotImplementedError( + f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' + ) return builder_instance diff --git a/modelscope/outputs.py b/modelscope/outputs.py index f279f311..200a03cd 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -188,6 +188,16 @@ TASK_OUTPUTS = { Tasks.body_2d_keypoints: [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + # video single object tracking result for single video + # { + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.video_single_object_tracking: [OutputKeys.BOXES], + # live category recognition result for single video # { # "scores": [0.885272, 0.014790631, 0.014558001], @@ -405,7 
+415,7 @@ TASK_OUTPUTS = { # audio processed for single file in PCM format # { - # "output_pcm": np.array with shape(samples,) and dtype float32 + # "output_pcm": pcm encoded audio bytes # } Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM], Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM], @@ -417,6 +427,19 @@ TASK_OUTPUTS = { # } Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM], + # { + # "kws_list": [ + # { + # 'keyword': '', # the keyword spotted + # 'offset': 19.4, # the keyword start time in second + # 'length': 0.68, # the keyword length in second + # 'confidence': 0.85 # the possibility if it is the keyword + # }, + # ... + # ] + # } + Tasks.keyword_spotting: [OutputKeys.KWS_LIST], + # ============ multi-modal tasks =================== # image caption result for single sample diff --git a/modelscope/pipelines/audio/__init__.py b/modelscope/pipelines/audio/__init__.py index 562125b4..b46ca87e 100644 --- a/modelscope/pipelines/audio/__init__.py +++ b/modelscope/pipelines/audio/__init__.py @@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .ans_pipeline import ANSPipeline from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline + from .kws_farfield_pipeline import KWSFarfieldPipeline from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline from .linear_aec_pipeline import LinearAECPipeline from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline @@ -14,6 +15,7 @@ else: _import_structure = { 'ans_pipeline': ['ANSPipeline'], 'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'], + 'kws_farfield_pipeline': ['KWSFarfieldPipeline'], 'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'], 'linear_aec_pipeline': ['LinearAECPipeline'], 'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'], diff --git a/modelscope/pipelines/audio/kws_farfield_pipeline.py b/modelscope/pipelines/audio/kws_farfield_pipeline.py new file mode 100644 index 00000000..a114e7fb --- /dev/null +++ b/modelscope/pipelines/audio/kws_farfield_pipeline.py @@ -0,0 +1,81 @@ +import io +import wave +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.keyword_spotting, + module_name=Pipelines.speech_dfsmn_kws_char_farfield) +class KWSFarfieldPipeline(Pipeline): + r"""A Keyword Spotting Inference Pipeline . + + When invoke the class with pipeline.__call__(), it accept only one parameter: + inputs(str): the path of wav file + """ + SAMPLE_RATE = 16000 + SAMPLE_WIDTH = 2 + INPUT_CHANNELS = 3 + OUTPUT_CHANNELS = 2 + + def __init__(self, model, **kwargs): + """ + use `model` to create a kws far field pipeline for prediction + Args: + model: model id on modelscope hub. 
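A minimal usage sketch for this far-field KWS pipeline, assuming the standard pipeline factory; the model id and wav path below are placeholders for illustration, not values defined in this patch:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Hypothetical model id; the patch registers the pipeline but does not pin a default model for it.
kws = pipeline(
    task=Tasks.keyword_spotting,
    model='damo/speech_dfsmn_kws_char_farfield')

# preprocess() accepts raw wav bytes (or a dict, optionally carrying an 'output_file' path).
with open('multichannel_16k.wav', 'rb') as f:  # hypothetical 3-channel, 16 kHz recording
    result = kws(f.read())

# Each entry in kws_list holds the spotted keyword plus its offset/length in seconds and a confidence score.
print(result['kws_list'])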
+ """ + super().__init__(model=model, **kwargs) + self.model = self.model.to(self.device) + self.model.eval() + frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH + self._nframe = self.model.size_in // frame_size + self.frame_count = 0 + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if isinstance(inputs, bytes): + return dict(input_file=inputs) + elif isinstance(inputs, Dict): + return inputs + else: + raise ValueError(f'Not supported input type: {type(inputs)}') + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + input_file = inputs['input_file'] + if isinstance(input_file, bytes): + input_file = io.BytesIO(input_file) + self.frame_count = 0 + kws_list = [] + with wave.open(input_file, 'rb') as fin: + if 'output_file' in inputs: + with wave.open(inputs['output_file'], 'wb') as fout: + fout.setframerate(self.SAMPLE_RATE) + fout.setnchannels(self.OUTPUT_CHANNELS) + fout.setsampwidth(self.SAMPLE_WIDTH) + self._process(fin, kws_list, fout) + else: + self._process(fin, kws_list) + return {OutputKeys.KWS_LIST: kws_list} + + def _process(self, + fin: wave.Wave_read, + kws_list, + fout: wave.Wave_write = None): + data = fin.readframes(self._nframe) + while len(data) >= self.model.size_in: + self.frame_count += self._nframe + result = self.model.forward_decode(data) + if fout: + fout.writeframes(result['pcm']) + if 'kws' in result: + result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE + kws_list.append(result['kws']) + data = fin.readframes(self._nframe) + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index b1d82557..041dfb34 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -255,7 +255,7 @@ class Pipeline(ABC): return self._collate_fn(torch.from_numpy(data)) elif isinstance(data, torch.Tensor): return data.to(self.device) - elif isinstance(data, (str, int, float, bool, type(None))): + elif isinstance(data, (bytes, str, int, float, bool, type(None))): return data elif isinstance(data, InputFeatures): return data diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 1066fa8d..4105e28b 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -124,12 +124,16 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_classification: (Pipelines.daily_image_classification, 'damo/cv_vit-base_image-classification_Dailylife-labels'), - Tasks.ocr_recognition: (Pipelines.ocr_recognition, - 'damo/cv_convnextTiny_ocr-recognition_damo'), + Tasks.ocr_recognition: + (Pipelines.ocr_recognition, + 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), + Tasks.video_single_object_tracking: + (Pipelines.video_single_object_tracking, + 'damo/cv_vitb_video-single-object-tracking_ostrack'), } diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 91a2f1e0..cee91c8e 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline from .crowd_counting_pipeline import CrowdCountingPipeline from .image_detection_pipeline import ImageDetectionPipeline + from .image_salient_detection_pipeline import 
ImageSalientDetectionPipeline from .face_detection_pipeline import FaceDetectionPipeline from .face_image_generation_pipeline import FaceImageGenerationPipeline from .face_recognition_pipeline import FaceRecognitionPipeline @@ -43,6 +44,7 @@ else: 'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'], 'crowd_counting_pipeline': ['CrowdCountingPipeline'], 'image_detection_pipeline': ['ImageDetectionPipeline'], + 'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'], 'face_detection_pipeline': ['FaceDetectionPipeline'], 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], 'face_recognition_pipeline': ['FaceRecognitionPipeline'], diff --git a/modelscope/pipelines/cv/image_salient_detection_pipeline.py b/modelscope/pipelines/cv/image_salient_detection_pipeline.py new file mode 100644 index 00000000..433275ba --- /dev/null +++ b/modelscope/pipelines/cv/image_salient_detection_pipeline.py @@ -0,0 +1,47 @@ +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.image_segmentation, module_name=Pipelines.salient_detection) +class ImageSalientDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_ndarray(input) + img_h, img_w, _ = img.shape + img = self.model.preprocess(img) + result = {'img': img, 'img_w': img_w, 'img_h': img_h} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input['img']) + result = { + 'data': outputs, + 'img_w': input['img_w'], + 'img_h': input['img_h'] + } + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + data = self.model.postprocess(inputs) + outputs = { + OutputKeys.SCORES: None, + OutputKeys.LABELS: None, + OutputKeys.MASKS: data + } + return outputs diff --git a/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py new file mode 100644 index 00000000..f4ba4d0b --- /dev/null +++ b/modelscope/pipelines/cv/video_single_object_tracking_pipeline.py @@ -0,0 +1,80 @@ +import os.path as osp +from typing import Any, Dict + +import cv2 + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_single_object_tracking.config.ostrack import \ + cfg +from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \ + OSTrack +from modelscope.models.cv.video_single_object_tracking.utils.utils import \ + check_box +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_single_object_tracking, + module_name=Pipelines.video_single_object_tracking) +class VideoSingleObjectTrackingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a single object tracking pipeline + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + self.cfg = cfg + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) + logger.info(f'loading model from {ckpt_path}') + self.tracker = OSTrack(ckpt_path, self.device) + logger.info('init tracker done') + + def preprocess(self, input) -> Input: + self.video_path = input[0] + self.init_bbox = input[1] + return input + + def forward(self, input: Input) -> Dict[str, Any]: + output_boxes = [] + cap = cv2.VideoCapture(self.video_path) + success, frame = cap.read() + if success is False: + raise Exception( + 'modelscope error: %s can not be decoded by OpenCV.' % + (self.video_path)) + + init_box = self.init_bbox + frame_h, frame_w = frame.shape[0:2] + if not check_box(init_box, frame_h, frame_w): + raise Exception('modelscope error: init_box out of image range ', + init_box) + output_boxes.append(init_box.copy()) + init_box[2] = init_box[2] - init_box[0] + init_box[3] = init_box[3] - init_box[1] + self.tracker.initialize(frame, {'init_bbox': init_box}) + logger.info('init bbox done') + + while True: + ret, frame = cap.read() + if frame is None: + break + out = self.tracker.track(frame) + state = [int(s) for s in out['target_bbox']] + output_boxes.append(state) + cap.release() + logger.info('tracking process done') + + return { + OutputKeys.BOXES: output_boxes, + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py index 2028e7dc..99cccee1 100644 --- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -1,11 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
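A minimal usage sketch for the VideoSingleObjectTrackingPipeline added above, assuming the default model id wired up in builder.py earlier in this patch; the video path and initial box are invented for illustration:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tracker = pipeline(
    task=Tasks.video_single_object_tracking,
    model='damo/cv_vitb_video-single-object-tracking_ostrack')

video_path = 'test_video.avi'    # hypothetical local file
init_bbox = [100, 80, 260, 300]  # [x1, y1, x2, y2] of the target in the first frame
result = tracker((video_path, init_bbox))

# One [x1, y1, x2, y2] box per decoded frame, starting with the initial box.
print(result['boxes'])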
from typing import Any, Dict, Optional, Union +import torch + from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor +from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, + Preprocessor) from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -35,9 +39,19 @@ class ImageCaptioningPipeline(Pipeline): else: raise NotImplementedError pipe_model.model.eval() - if preprocessor is None and isinstance(pipe_model, OfaForAllTasks): - preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir) + if preprocessor is None: + if isinstance(pipe_model, OfaForAllTasks): + preprocessor = OfaPreprocessor(pipe_model.model_dir) + elif isinstance(pipe_model, MPlugForAllTasks): + preprocessor = MPlugPreprocessor(pipe_model.model_dir) super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs + if isinstance(self.model, OfaForAllTasks): + return inputs + return {OutputKeys.CAPTION: inputs} diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py index 9c694500..b2442a3e 100644 --- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py @@ -5,13 +5,12 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model -from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering, - OfaForAllTasks) +from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor, - OfaPreprocessor) +from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, + Preprocessor) from modelscope.utils.constant import Tasks __all__ = ['VisualQuestionAnsweringPipeline'] @@ -23,9 +22,8 @@ __all__ = ['VisualQuestionAnsweringPipeline'] class VisualQuestionAnsweringPipeline(Pipeline): def __init__(self, - model: Union[MPlugForVisualQuestionAnswering, str], - preprocessor: Optional[ - MPlugVisualQuestionAnsweringPreprocessor] = None, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a visual question answering pipeline for prediction @@ -35,18 +33,12 @@ class VisualQuestionAnsweringPipeline(Pipeline): """ model = model if isinstance(model, Model) else Model.from_pretrained(model) - self.tokenizer = None if preprocessor is None: if isinstance(model, OfaForAllTasks): preprocessor = OfaPreprocessor(model.model_dir) - elif isinstance(model, MPlugForVisualQuestionAnswering): - preprocessor = MPlugVisualQuestionAnsweringPreprocessor( - model.model_dir) - if isinstance(model, MPlugForVisualQuestionAnswering): - model.eval() - self.tokenizer = 
model.tokenizer - else: - model.model.eval() + elif isinstance(model, MPlugForAllTasks): + preprocessor = MPlugPreprocessor(model.model_dir) + model.model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) def forward(self, inputs: Dict[str, Any], @@ -64,14 +56,6 @@ class VisualQuestionAnsweringPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - if self.tokenizer is None: + if isinstance(self.model, OfaForAllTasks): return inputs - replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''), - ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''), - ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', '')) - - pred_string = self.tokenizer.decode(inputs[0][0]) - for _old, _new in replace_tokens_bert: - pred_string = pred_string.replace(_old, _new) - pred_string.strip() - return {OutputKeys.TEXT: pred_string} + return {OutputKeys.TEXT: inputs} diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 9a2adb04..0328b91a 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .base import Preprocessor from .builder import PREPROCESSORS, build_preprocessor - from .common import Compose + from .common import Compose, ToTensor, Filter from .asr import WavToScp from .audio import LinearAECAndFbank from .image import (LoadImage, load_image, @@ -14,8 +14,7 @@ if TYPE_CHECKING: ImageInstanceSegmentationPreprocessor, ImageDenoisePreprocessor) from .kws import WavToLists - from .multi_modal import (OfaPreprocessor, - MPlugVisualQuestionAnsweringPreprocessor) + from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) from .nlp import (Tokenize, SequenceClassificationPreprocessor, TextGenerationPreprocessor, TokenClassificationPreprocessor, @@ -33,7 +32,7 @@ else: _import_structure = { 'base': ['Preprocessor'], 'builder': ['PREPROCESSORS', 'build_preprocessor'], - 'common': ['Compose'], + 'common': ['Compose', 'ToTensor', 'Filter'], 'audio': ['LinearAECAndFbank'], 'asr': ['WavToScp'], 'video': ['ReadVideoData'], @@ -42,8 +41,7 @@ else: 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' ], 'kws': ['WavToLists'], - 'multi_modal': - ['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'], + 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], 'nlp': [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassificationPreprocessor', diff --git a/modelscope/preprocessors/common.py b/modelscope/preprocessors/common.py index 89fa859d..aa1db84c 100644 --- a/modelscope/preprocessors/common.py +++ b/modelscope/preprocessors/common.py @@ -2,6 +2,10 @@ import time from collections.abc import Sequence +from typing import Mapping + +import numpy as np +import torch from .builder import PREPROCESSORS, build_preprocessor @@ -25,12 +29,18 @@ class Compose(object): if isinstance(transform, dict): if self.field_name is None: transform = build_preprocessor(transform, field_name) - self.transforms.append(transform) + else: + # if not found key in field_name, try field_name=None(default_group) + try: + transform = build_preprocessor(transform, field_name) + except KeyError: + transform = build_preprocessor(transform, None) elif callable(transform): - self.transforms.append(transform) + pass else: raise TypeError('transform must be callable or a dict, but got' f' {type(transform)}') + self.transforms.append(transform) def __call__(self, data): for t in 
self.transforms: @@ -52,3 +62,82 @@ class Compose(object): format_string += f'\n {t}' format_string += '\n)' return format_string + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not isinstance(data, str): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PREPROCESSORS.register_module() +class ToTensor(object): + """Convert target object to tensor. + + Args: + keys (Sequence[str]): Key of data to be converted to Tensor. + Only valid when data is type of `Mapping`. If `keys` is None, + all values of keys ​​will be converted to tensor by default. + """ + + def __init__(self, keys=None): + self.keys = keys + + def __call__(self, data): + if isinstance(data, Mapping): + if self.keys is None: + self.keys = list(data.keys()) + + for key in self.keys: + data[key] = to_tensor(data[key]) + else: + data = to_tensor(data) + + return data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PREPROCESSORS.register_module() +class Filter(object): + """This is usually the last stage of the dataloader transform. + Only data of reserved keys will be kept and passed directly to the model, others will be removed. + + Args: + keys (Sequence[str]): Keys of data to be reserved, others will be removed. 
+ """ + + def __init__(self, reserved_keys): + self.reserved_keys = reserved_keys + + def __call__(self, data): + assert isinstance(data, Mapping) + + reserved_data = {} + for key in self.reserved_keys: + reserved_data[key] = data[key] + + return reserved_data + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.reserved_keys})' diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 775514a2..6932371d 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor): super().__init__(*args, **kwargs) self.model_dir: str = model_dir + from .common import Filter + + # TODO: `Filter` should be moved to configurarion file of each model + self._transforms = [Filter(reserved_keys=['input', 'target'])] + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """process the raw input data @@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ + for t in self._transforms: + data = t(data) + return data diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 46648832..5046e166 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -19,7 +19,7 @@ from .ofa.utils.collate import collate_fn __all__ = [ 'OfaPreprocessor', - 'MPlugVisualQuestionAnsweringPreprocessor', + 'MPlugPreprocessor', ] @@ -28,7 +28,7 @@ __all__ = [ class OfaPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -102,39 +102,55 @@ class OfaPreprocessor(Preprocessor): @PREPROCESSORS.register_module( - Fields.multi_modal, - module_name=Preprocessors.mplug_visual_question_answering) -class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): + Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor) +class MPlugPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via 'bert-base-uncased' tokenizer and configuration - - """ - from transformers import BertTokenizer - from modelscope.models.multi_modal.mplug import CONFIG_NAME, VOCAB_NAME, MPlugConfig - super().__init__(*args, **kwargs) + self.model_dir = model_dir - # tokenizer - self.tokenizer = BertTokenizer.from_pretrained( - osp.join(model_dir, VOCAB_NAME)) + self._tokenizer = None + self._patch_resize_transform = None - # load configuration - config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME)) + @property + def tokenizer(self): + from transformers import BertTokenizer - # Initialize transform - from torchvision import transforms - mean = (0.48145466, 0.4578275, 0.40821073) - std = (0.26862954, 0.26130258, 0.27577711) + if self._tokenizer is None: + self._tokenizer = BertTokenizer.from_pretrained(self.model_dir) + return self._tokenizer + + @property + def patch_resize_transform(self): + if self._patch_resize_transform is None: + from torchvision import transforms + from modelscope.models.multi_modal.mplug import CONFIG_NAME, MPlugConfig + + config = MPlugConfig.from_yaml_file( + osp.join(self.model_dir, CONFIG_NAME)) + + mean = (0.48145466, 0.4578275, 0.40821073) + std = (0.26862954, 0.26130258, 0.27577711) + + self._patch_resize_transform = transforms.Compose([ + transforms.Resize((config.image_res, config.image_res), + 
interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + return self._patch_resize_transform + + def __call__(self, *args, **kwargs): + call_mapping = { + Tasks.visual_question_answering: self.vqa_call, + Tasks.image_captioning: self.caption_call + } - self.patch_resize_transform = transforms.Compose([ - transforms.Resize((config.image_res, config.image_res), - interpolation=Image.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=mean, std=std), - ]) + self.cfg = Config.from_file( + osp.join(self.model_dir, ModelFile.CONFIGURATION)) + return call_mapping[self.cfg.task](*args, **kwargs) - def __call__(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: + def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]: image: Image.Image = data[0] if isinstance(data, tuple) else data['image'] question: str = data[1] if isinstance(data, @@ -147,3 +163,19 @@ class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor): return_tensors='pt') return {'image': image, 'question': question, 'train': False} + + def caption_call( + self, data: Union[Image.Image, tuple, + Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(data, Image.Image): + image = data + elif isinstance(data, tuple): + image = data[0] + else: + image = data['image'] + image = image.convert('RGB') + image = self.patch_resize_transform(image) + image = torch.stack([image], dim=0) + question = self.tokenizer('', return_tensors='pt') + + return {'image': image, 'question': question, 'train': False} diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index f231df9a..25576667 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -4,6 +4,7 @@ import os.path as osp import uuid from typing import Any, Dict, Iterable, Optional, Tuple, Union +import numpy as np from transformers import AutoTokenizer from modelscope.metainfo import Models, Preprocessors @@ -43,7 +44,7 @@ class Tokenize(Preprocessor): class SequenceClassificationPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor): text_b, return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None, **self.tokenize_kwargs) + output = { + k: np.array(v) if isinstance(v, list) else v + for k, v in output.items() + } self.labels_to_id(labels, output) return output @@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor): if labels is not None: if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \ and self.label2id is not None: - output[OutputKeys.LABEL] = [ + output[OutputKeys.LABELS] = [ self.label2id[str(label)] for label in labels ] elif label_can_be_mapped(labels) and self.label2id is not None: - output[OutputKeys.LABEL] = self.label2id[str(labels)] + output[OutputKeys.LABELS] = self.label2id[str(labels)] else: - output[OutputKeys.LABEL] = labels + output[OutputKeys.LABELS] = labels @PREPROCESSORS.register_module( @@ -286,7 +291,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase): """ def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -517,7 +522,7 @@ class NERPreprocessor(Preprocessor): """ def 
__init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path @@ -609,7 +614,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): from fairseq.data import Dictionary - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data via the vocab file from the `model_dir` path Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py index c7339538..e2602eaa 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py @@ -22,7 +22,7 @@ __all__ = ['DialogIntentPredictionPreprocessor'] class DialogIntentPredictionPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py index 8ed97452..a2157c2b 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py @@ -20,7 +20,7 @@ __all__ = ['DialogModelingPreprocessor'] class DialogModelingPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py index 038ab09b..6eb17288 100644 --- a/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py @@ -17,7 +17,7 @@ __all__ = ['DialogStateTrackingPreprocessor'] class DialogStateTrackingPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py index f924588c..5bff360f 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/space/fields/gen_field.py @@ -8,6 +8,7 @@ from itertools import chain import numpy as np from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.nlp.space import ontology, utils from modelscope.utils.nlp.space.db_ops import MultiWozDB @@ -343,7 +344,7 @@ class MultiWOZBPETextField(BPETextField): ] special_tokens.extend(self.add_sepcial_tokens()) self.tokenizer = Tokenizer( - vocab_path=os.path.join(model_dir, 'vocab.txt'), + vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), special_tokens=special_tokens, tokenizer_type=config.BPETextField.tokenizer_type) self.understand_ids = self.tokenizer.convert_tokens_to_ids( diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py index 4ed7ab6c..dc00e677 100644 --- 
a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/space/fields/intent_field.py @@ -14,6 +14,7 @@ import numpy as np from tqdm import tqdm from modelscope.preprocessors.space.tokenizer import Tokenizer +from modelscope.utils.constant import ModelFile from modelscope.utils.nlp.space import ontology from modelscope.utils.nlp.space.scores import hierarchical_set_score from modelscope.utils.nlp.space.utils import list2np @@ -50,7 +51,7 @@ class BPETextField(object): ] special_tokens.extend(self.add_sepcial_tokens()) self.tokenizer = Tokenizer( - vocab_path=os.path.join(model_dir, 'vocab.txt'), + vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE), special_tokens=special_tokens, tokenizer_type=config.BPETextField.tokenizer_type) self.understand_ids = self.numericalize(self.understand_tokens) diff --git a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py b/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py index 2032dcf7..b5dd73a9 100644 --- a/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py +++ b/modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py @@ -28,7 +28,7 @@ __all__ = ['ConversationalTextToSqlPreprocessor'] class ConversationalTextToSqlPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): - """preprocess the data via the vocab.txt from the `model_dir` path + """preprocess the data Args: model_dir (str): model path diff --git a/modelscope/preprocessors/star/fields/common_utils.py b/modelscope/preprocessors/star/fields/common_utils.py index 2d33b7ab..431e66b6 100644 --- a/modelscope/preprocessors/star/fields/common_utils.py +++ b/modelscope/preprocessors/star/fields/common_utils.py @@ -193,6 +193,15 @@ class SubPreprocessor(): from nltk import data data.path.append(os.path.join(self.model_dir, 'nltk_data')) + + zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt') + if os.path.exists(zippath): + print('punkt already exists!') + else: + import zipfile + with zipfile.ZipFile(zippath + '.zip') as zf: + zf.extractall( + os.path.join(self.model_dir, 'nltk_data/tokenizers/')) question = nltk.word_tokenize(question) question = mwtokenizer.tokenize(question) diff --git a/modelscope/trainers/cv/image_instance_segmentation_trainer.py b/modelscope/trainers/cv/image_instance_segmentation_trainer.py index e7632147..2e2415dc 100644 --- a/modelscope/trainers/cv/image_instance_segmentation_trainer.py +++ b/modelscope/trainers/cv/image_instance_segmentation_trainer.py @@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer): def prediction_step(self, model, inputs): pass - - def to_task_dataset(self, datasets, mode, preprocessor=None): - # wait for dataset interface to become stable... 
- return datasets.to_torch_dataset(preprocessor) diff --git a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py index 7ef0de79..0941d1cd 100644 --- a/modelscope/trainers/cv/image_portrait_enhancement_trainer.py +++ b/modelscope/trainers/cv/image_portrait_enhancement_trainer.py @@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer): train_outputs = dict() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping): d_loss = model._train_forward_d(**inputs) diff --git a/modelscope/trainers/hooks/hook.py b/modelscope/trainers/hooks/hook.py index 3a58557b..75cc226c 100644 --- a/modelscope/trainers/hooks/hook.py +++ b/modelscope/trainers/hooks/hook.py @@ -192,7 +192,7 @@ class Hook: Whether to reach the end of every epoch Returns: bool """ - return trainer.inner_iter + 1 == len(trainer.data_loader) + return trainer.inner_iter + 1 == trainer.iters_per_epoch def is_last_epoch(self, trainer): """ diff --git a/modelscope/trainers/hooks/logger/text_logger_hook.py b/modelscope/trainers/hooks/logger/text_logger_hook.py index a204284c..6629a0c9 100644 --- a/modelscope/trainers/hooks/logger/text_logger_hook.py +++ b/modelscope/trainers/hooks/logger/text_logger_hook.py @@ -93,7 +93,7 @@ class TextLoggerHook(LoggerHook): lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}' if self.by_epoch: - log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t' + log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{trainer.iters_per_epoch}]\t' else: log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t' log_str += f'{lr_str}, ' diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 322070a1..3692b486 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.train_keys = build_dataset_keys( self.cfg.dataset.train if hasattr(self.cfg, 'dataset') and hasattr(self.cfg.dataset, 'train') else None) - # TODO eval may has special keys, which is now not supported. - # because there is only one preprocessor in the trainer, and it only supports one group of keys. - self.eval_keys = self.train_keys + self.eval_keys = build_dataset_keys( + self.cfg.dataset.val if hasattr(self.cfg, 'dataset') + and hasattr(self.cfg.dataset, 'val') else None) + if len(self.eval_keys) == 0: + self.eval_keys = self.train_keys super().__init__( model=model_dir, @@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): elif isinstance(model, nn.Module): return model - def build_preprocessor(self) -> Preprocessor: + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: """Build the preprocessor. User can override this method to implement custom logits. 
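The hunks on either side of this point rewrite `NlpEpochBasedTrainer.build_preprocessor` to return a (train, eval) preprocessor pair driven by the `preprocessor` section of the model configuration. A minimal sketch of the two layouts the new code distinguishes follows; the `type` value and field names are hypothetical placeholders, not values taken from this patch, and only the presence of `train`/`val` groups versus a top-level `type` key is what the code actually checks:

# Illustrative sketch only (placeholder names).
# 1) A single preprocessor config without 'train'/'val': reused for both modes.
shared_preprocessor_cfg = {
    'type': 'sen-sim-tokenizer',       # hypothetical preprocessor type
    'first_sequence': 'sentence1',     # hypothetical dataset field
    'second_sequence': 'sentence2',
}

# 2) A split config with one group per mode; the trainer later injects
#    'model_dir', the dataset keys and 'mode' before building each group.
split_preprocessor_cfg = {
    'train': {'type': 'sen-sim-tokenizer', 'first_sequence': 'sentence1'},
    'val': {'type': 'sen-sim-tokenizer', 'first_sequence': 'sentence1'},
}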
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): model_args = {} if self.label2id is None else { 'label2id': self.label2id } - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - **model_args, - 'mode': - ModeKeys.TRAIN, - **self.train_keys, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + _train_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.train_keys, 'mode': ModeKeys.TRAIN + }) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + _eval_cfg.update({ + 'model_dir': self.model_dir, + **model_args, + **self.eval_keys, 'mode': ModeKeys.EVAL + }) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor @TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer) @@ -178,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer): """Veco evaluates the datasets one by one. """ - from modelscope.task_datasets import VecoDataset + from modelscope.msdatasets.task_datasets import VecoDataset self.model.eval() self._mode = ModeKeys.EVAL metric_values = {} diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a96c186c..0916495c 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -5,15 +5,15 @@ import time from collections.abc import Mapping from distutils.version import LooseVersion from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union import json import numpy as np import torch -from addict import Dict from torch import distributed as dist from torch import nn from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate from torch.utils.data.distributed import DistributedSampler from modelscope.hub.snapshot_download import snapshot_download @@ -21,23 +21,26 @@ from modelscope.metainfo import Trainers from modelscope.metrics import build_metric, task_default_metrics from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset -from modelscope.preprocessors import build_preprocessor +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset from modelscope.preprocessors.base import Preprocessor -from modelscope.task_datasets.builder import build_task_dataset -from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset +from modelscope.preprocessors.builder import build_preprocessor +from modelscope.preprocessors.common import Compose from modelscope.trainers.hooks.builder import HOOKS from modelscope.trainers.hooks.priority import Priority, get_priority from modelscope.trainers.lrscheduler.builder import build_lr_scheduler from modelscope.trainers.optimizer.builder import build_optimizer from 
modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys, - ModelFile, Tasks, TrainerStages) +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.logger import get_logger from modelscope.utils.registry import build_from_cfg -from modelscope.utils.tensor_utils import torch_default_data_collator -from modelscope.utils.torch_utils import (broadcast, create_device, - get_dist_info, init_dist) +from modelscope.utils.torch_utils import (create_device, get_dist_info, + init_dist) from .base import BaseTrainer from .builder import TRAINERS from .default_config import DEFAULT_CONFIG @@ -83,7 +86,8 @@ class EpochBasedTrainer(BaseTrainer): data_collator: Optional[Callable] = None, train_dataset: Optional[Union[MsDataset, Dataset]] = None, eval_dataset: Optional[Union[MsDataset, Dataset]] = None, - preprocessor: Optional[Preprocessor] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler] = (None, None), @@ -120,24 +124,46 @@ class EpochBasedTrainer(BaseTrainer): else: self.work_dir = self.cfg.train.get('work_dir', './work_dir') - self.preprocessor = None + self.train_preprocessor, self.eval_preprocessor = None, None if isinstance(preprocessor, Preprocessor): - self.preprocessor = preprocessor - elif hasattr(self.cfg, 'preprocessor'): - self.preprocessor = self.build_preprocessor() - if self.preprocessor is not None: - self.preprocessor.mode = ModeKeys.TRAIN + self.train_preprocessor = preprocessor + self.eval_preprocessor = preprocessor + elif isinstance(preprocessor, Mapping): + if not (ConfigKeys.train in preprocessor + or ConfigKeys.val in preprocessor): + raise ValueError( + f'Preprocessor must split with `{ConfigKeys.train}` and `{ConfigKeys.val}` keys!' + ) + if ConfigKeys.train in preprocessor: + assert isinstance(preprocessor[ConfigKeys.train], Preprocessor) + self.train_preprocessor = preprocessor[ConfigKeys.train] + if ConfigKeys.val in preprocessor: + assert isinstance(preprocessor[ConfigKeys.val], Preprocessor) + self.eval_preprocessor = preprocessor[ConfigKeys.val] + elif hasattr(self.cfg, ConfigFields.preprocessor): + self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor( + ) + + if self.train_preprocessor is not None: + self.train_preprocessor.mode = ModeKeys.TRAIN + if self.eval_preprocessor is not None: + self.eval_preprocessor.mode = ModeKeys.EVAL + device_name = kwargs.get('device', 'gpu') assert device_name in ['gpu', 'cpu'], 'device should be either cpu or gpu.' 
self.device = create_device(device_name == 'cpu') self.train_dataset = self.to_task_dataset( - train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor) + train_dataset, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) self.eval_dataset = self.to_task_dataset( - eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor) + eval_dataset, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) - self.data_collator = data_collator if data_collator is not None else torch_default_data_collator + self.data_collator = data_collator if data_collator is not None else default_collate self.metrics = self.get_metrics() self._metric_values = None self.optimizers = optimizers @@ -155,6 +181,16 @@ class EpochBasedTrainer(BaseTrainer): else: self._max_epochs = kwargs['max_epochs'] + self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None) + self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None) + if self._train_iters_per_epoch is None and hasattr( + self.cfg.train, 'train_iters_per_epoch'): + self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch + if self._eval_iters_per_epoch is None and hasattr( + self.cfg, 'evaluation') and hasattr(self.cfg.evaluation, + 'val_iters_per_epoch'): + self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch + self.use_fp16 = kwargs.get('use_fp16', False) # TODO @wenmeng.zwm add seed init fn @@ -211,7 +247,32 @@ class EpochBasedTrainer(BaseTrainer): @property def max_iters(self): """int: Maximum training iterations.""" - return self._max_epochs * len(self.data_loader) + return self._max_epochs * self.iters_per_epoch + + @property + def iters_per_epoch(self): + """int: Total iterations of one epoch""" + + def _get_data_len(data_loader): + try: + return len(data_loader) + except Exception as e: + self.logger.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, ' + 'or add `train_iters_per_epoch` and `train_iters_per_epoch` ' + 'to your configuration file or kwargs') + + if self.mode == ModeKeys.TRAIN: + if self._train_iters_per_epoch is not None: + return self._train_iters_per_epoch + else: + return _get_data_len(self.train_dataloader) + elif self.mode == ModeKeys.EVAL: + if self._eval_iters_per_epoch is not None: + return self._eval_iters_per_epoch + else: + return _get_data_len(self.eval_dataloader) def to_task_dataset(self, datasets: Union[Dataset, List[Dataset]], @@ -228,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer): if isinstance(datasets, TorchTaskDataset): return datasets elif isinstance(datasets, MsDataset): - datasets = datasets.to_torch_dataset( - preprocessors=self.preprocessor) - return datasets + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) + return datasets.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=preprocessor) elif isinstance(datasets, List) and isinstance( datasets[0], MsDataset): + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \ + else ConfigDict(type=None, mode=mode) datasets = [ - d.to_torch_dataset(preprocessor=self.preprocessor) - for d in datasets + d.to_torch_dataset( + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=preprocessor) for d in datasets ] cfg = ConfigDict( type=self.cfg.task, mode=mode, datasets=datasets) @@ -258,24 +326,44 @@ class EpochBasedTrainer(BaseTrainer): else: return datasets - def build_preprocessor(self) -> 
Preprocessor: - """Build the preprocessor. + def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]: + """Build train and eval preprocessor. User can override this method to implement custom logits. - Returns: The preprocessor instance. + Returns: The train preprocessor and eval preprocessor instance. """ - # TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor - # when they are different ones in training and evaluation - cfg = ConfigDict({ - **getattr(self.cfg, 'preprocessor'), - 'model_dir': - self.model_dir, - 'mode': - ModeKeys.TRAIN, - }) - return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task)) + field_name = Tasks.find_field_by_task(self.cfg.task) + train_preprocessor, eval_preprocessor = None, None + _train_cfg, _eval_cfg = {}, {} + _dafault_args = {'model_dir': self.model_dir} + + if 'type' not in self.cfg.preprocessor and ( + 'train' in self.cfg.preprocessor + or 'val' in self.cfg.preprocessor): + if 'train' in self.cfg.preprocessor: + _train_cfg = self.cfg.preprocessor.train + if 'val' in self.cfg.preprocessor: + _eval_cfg = self.cfg.preprocessor.val + else: + _train_cfg = self.cfg.preprocessor + _eval_cfg = self.cfg.preprocessor + + if len(_train_cfg): + if isinstance(_train_cfg, Sequence): + # TODO: for Sequence, need adapt to `mode` and `mode_dir` args, + # and add mode for Compose or other plans + raise NotImplementedError('Not supported yet!') + _train_cfg.update(_dafault_args) + train_preprocessor = build_preprocessor(_train_cfg, field_name) + if len(_eval_cfg): + if isinstance(_eval_cfg, Sequence): + raise NotImplementedError('Not supported yet!') + _eval_cfg.update(_dafault_args) + eval_preprocessor = build_preprocessor(_eval_cfg, field_name) + + return train_preprocessor, eval_preprocessor def get_metrics(self) -> List[str]: """Get the metric class types. @@ -373,34 +461,6 @@ class EpochBasedTrainer(BaseTrainer): return build_parallel(dp_cfg) - def collate_fn(self, data): - """Prepare the input just before the forward function. - This method will move the tensors to the right device. - Usually this method does not need to be overridden. - - Args: - data: The data out of the dataloader. - - Returns: The processed data. - - """ - from torch.utils.data.dataloader import default_collate - if isinstance(data, dict) or isinstance(data, Mapping): - return type(data)({k: self.collate_fn(v) for k, v in data.items()}) - elif isinstance(data, (tuple, list)): - if isinstance(data[0], (int, float)): - return default_collate(data).to(self.device) - else: - return type(data)(self.collate_fn(v) for v in data) - elif isinstance(data, np.ndarray): - return self.collate_fn(torch.from_numpy(data)) - elif isinstance(data, torch.Tensor): - return data.to(self.device) - elif isinstance(data, (str, int, float, bool)): - return data - else: - raise ValueError(f'Unsupported data type {type(data)}') - def train_step(self, model, inputs): """ Perform a training step on a batch of inputs. 
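With the trainer's hand-rolled `collate_fn` removed above, batching falls back to torch's `default_collate` and device placement moves into the training/eval loops via the new `modelscope.utils.data_utils.to_device` helper added later in this patch. A minimal sketch of that replacement path, with made-up sample dicts:

import torch
from torch.utils.data.dataloader import default_collate

from modelscope.utils.data_utils import to_device  # helper introduced by this patch

# Two made-up samples, shaped like what a task dataset might yield.
samples = [
    {'input_ids': torch.tensor([1, 2, 3]), 'labels': torch.tensor(0)},
    {'input_ids': torch.tensor([4, 5, 6]), 'labels': torch.tensor(1)},
]

# default_collate stacks tensors key by key: input_ids -> (2, 3), labels -> (2,)
batch = default_collate(samples)

# to_device walks dicts/lists/tuples recursively and moves only the tensors.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
batch = to_device(batch, device)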
@@ -421,7 +481,6 @@ class EpochBasedTrainer(BaseTrainer): # TODO: find more pretty way to change mode model.train() self._mode = ModeKeys.TRAIN - inputs = self.collate_fn(inputs) # call model forward but not __call__ to skip postprocess if isinstance(inputs, Mapping) and not func_receive_dict_inputs(model.forward): @@ -486,7 +545,9 @@ class EpochBasedTrainer(BaseTrainer): if self.train_dataset is None: train_data = self.cfg.dataset.train self.train_dataset = self.build_dataset( - train_data, mode=ModeKeys.TRAIN) + train_data, + mode=ModeKeys.TRAIN, + preprocessor=self.train_preprocessor) data_loader = self._build_dataloader_with_dataset( self.train_dataset, @@ -505,7 +566,9 @@ class EpochBasedTrainer(BaseTrainer): if self.eval_dataset is None: val_data = self.cfg.dataset.val self.eval_dataset = self.build_dataset( - val_data, mode=ModeKeys.EVAL) + val_data, + mode=ModeKeys.EVAL, + preprocessor=self.eval_preprocessor) batch_size = self.cfg.evaluation.batch_size workers = self.cfg.evaluation.workers @@ -521,7 +584,7 @@ class EpochBasedTrainer(BaseTrainer): ) return data_loader - def build_dataset(self, data_cfg, mode): + def build_dataset(self, data_cfg, mode, preprocessor=None): """ Build torch dataset object using data config """ dataset = MsDataset.load( @@ -530,9 +593,13 @@ class EpochBasedTrainer(BaseTrainer): subset_name=data_cfg.subset_name if hasattr( data_cfg, 'subset_name') else None, hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope, + **data_cfg, ) + cfg = ConfigDict(type=self.cfg.model.type, mode=mode) torch_dataset = dataset.to_torch_dataset( - preprocessors=self.preprocessor, ) + task_data_config=cfg, + task_name=self.cfg.task, + preprocessors=self.preprocessor) dataset = self.to_task_dataset(torch_dataset, mode) return dataset @@ -698,6 +765,7 @@ class EpochBasedTrainer(BaseTrainer): self.invoke_hook(TrainerStages.before_train_epoch) time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) self.data_batch = data_batch self._inner_iter = i self.invoke_hook(TrainerStages.before_train_iter) @@ -706,6 +774,9 @@ class EpochBasedTrainer(BaseTrainer): del self.data_batch self._iter += 1 + if i + 1 >= self.iters_per_epoch: + break + self.invoke_hook(TrainerStages.after_train_epoch) self._epoch += 1 @@ -721,17 +792,21 @@ class EpochBasedTrainer(BaseTrainer): metric_values = multi_gpu_test( self.model, data_loader, + device=self.device, tmpdir=None, gpu_collect=False, - data_collate_fn=self.collate_fn, - metric_classes=metric_classes) + metric_classes=metric_classes, + data_loader_iters_per_gpu=self.iters_per_epoch) else: from modelscope.trainers.utils.inference import single_gpu_test metric_values = single_gpu_test( self.model, data_loader, - data_collate_fn=self.collate_fn, - metric_classes=metric_classes) + device=self.device, + metric_classes=metric_classes, + data_loader_iters=self.iters_per_epoch) + + self._inner_iter = self.iters_per_epoch - 1 # start from index 0 return metric_values diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index a90a58b6..d368c340 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) Alibaba, Inc. and its affiliates. 
+import logging import os import pickle import shutil @@ -10,6 +11,7 @@ import torch from torch import distributed as dist from tqdm import tqdm +from modelscope.utils.data_utils import to_device from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) @@ -17,25 +19,41 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, def single_gpu_test(model, data_loader, - data_collate_fn=None, - metric_classes=None): + device, + metric_classes=None, + data_loader_iters=None): """Test model with a single gpu. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. - data_collate_fn: An optional data_collate_fn before fed into the model - metric_classes(List): List of Metric class that uses to collect metrics + device (str | torch.device): The target device for the data. + metric_classes (List): List of Metric class that uses to collect metrics + data_loader_iters (int): Used when dataset has no attribute __len__ or only load part of dataset. Returns: list: The prediction results. """ model.eval() dataset = data_loader.dataset - with tqdm(total=len(dataset), desc='test samples') as pbar: - for data in data_loader: - if data_collate_fn is not None: - data = data_collate_fn(data) + progress_with_iters = False + if data_loader_iters is None: + try: + data_len = len(dataset) + except Exception as e: + logging.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, or provide ``data_loader_iters``' + ) + desc = 'Total test samples' + else: + progress_with_iters = True + data_len = data_loader_iters + desc = 'Test iterations' + + with tqdm(total=data_len, desc=desc) as pbar: + for i, data in enumerate(data_loader): + data = to_device(data, device) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( model.forward): @@ -46,13 +64,19 @@ def single_gpu_test(model, for metric_cls in metric_classes: metric_cls.add(result, data) - if isinstance(data, dict): - batch_size = len(next(iter(data.values()))) + if progress_with_iters: + batch_size = 1 # iteration count else: - batch_size = len(data) + if isinstance(data, dict): + batch_size = len(next(iter(data.values()))) + else: + batch_size = len(data) for _ in range(batch_size): pbar.update() + if progress_with_iters and (i + 1) >= data_len: + break + metric_values = {} for metric_cls in metric_classes: metric_values.update(metric_cls.evaluate()) @@ -62,10 +86,11 @@ def single_gpu_test(model, def multi_gpu_test(model, data_loader, + device, tmpdir=None, gpu_collect=False, - data_collate_fn=None, - metric_classes=None): + metric_classes=None, + data_loader_iters_per_gpu=None): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results @@ -77,12 +102,12 @@ def multi_gpu_test(model, Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. + device: (str | torch.device): The target device for the data. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. - data_collate_fn: An optional data_collate_fn before fed into the model metric_classes(List): List of Metric class that uses to collect metrics - + data_loader_iters_per_gpu (int): Used when dataset has no attribute __len__ or only load part of dataset. Returns: list: The prediction results. 
""" @@ -90,16 +115,31 @@ def multi_gpu_test(model, results = [] data_list = [] dataset = data_loader.dataset + rank, world_size = get_dist_info() - time.sleep(2) # This line can prevent deadlock problem in some cases. + progress_with_iters = False + if data_loader_iters_per_gpu is None: + try: + data_len = len(dataset) + total_samples = data_len + except Exception as e: + logging.error(e) + raise ValueError( + 'Please implement ``__len__`` method for your dataset, or provide ``data_loader_iters_per_gpu``' + ) + desc = 'Total test samples with multi gpus' + else: + total_samples = 0 + progress_with_iters = True + data_len = data_loader_iters_per_gpu * world_size + desc = 'Total test iterations with multi gpus' - rank, world_size = get_dist_info() + time.sleep(2) # This line can prevent deadlock problem in some cases. count = 0 - with tqdm(total=len(dataset), desc='test samples with multi gpus') as pbar: - for _, data in enumerate(data_loader): - if data_collate_fn is not None: - data = data_collate_fn(data) + with tqdm(total=data_len, desc=desc) as pbar: + for i, data in enumerate(data_loader): + data = to_device(data, device) data_list.append(data) with torch.no_grad(): if isinstance(data, Mapping) and not func_receive_dict_inputs( @@ -114,24 +154,32 @@ def multi_gpu_test(model, batch_size = len(next(iter(data.values()))) else: batch_size = len(data) + + if progress_with_iters: + total_samples += batch_size * world_size + batch_size = 1 # iteration count + batch_size_all = batch_size * world_size count += batch_size_all - if count > len(dataset): - batch_size_all = len(dataset) - (count - batch_size_all) + if count > data_len: + batch_size_all = data_len - (count - batch_size_all) for _ in range(batch_size_all): pbar.update() + if progress_with_iters and (i + 1) >= data_len: + break + # TODO: allgather data list may cost a lot of memory and needs to be redesigned # collect results and data from all ranks if gpu_collect: - results = collect_results_gpu(results, len(dataset)) - data_list = collect_results_gpu(data_list, len(dataset)) + results = collect_results_gpu(results, total_samples) + data_list = collect_results_gpu(data_list, total_samples) else: if tmpdir is None: tmpdir = make_tmp_dir() - results = collect_results_cpu(results, len(dataset), + results = collect_results_cpu(results, total_samples, os.path.join(tmpdir, 'predict')) - data_list = collect_results_cpu(data_list, len(dataset), + data_list = collect_results_cpu(data_list, total_samples, os.path.join(tmpdir, 'groundtruth')) if is_master(): diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index a86dfbc2..759bd447 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -30,8 +30,8 @@ MODELSCOPE_PATH = '/'.join(os.path.dirname(__file__).split('/')[:-1]) REGISTER_MODULE = 'register_module' IGNORED_PACKAGES = ['modelscope', '.'] SCAN_SUB_FOLDERS = [ - 'models', 'metrics', 'pipelines', 'preprocessors', 'task_datasets', - 'trainers' + 'models', 'metrics', 'pipelines', 'preprocessors', + 'msdatasets/task_datasets', 'trainers' ] INDEXER_FILE = 'ast_indexer' DECORATOR_KEY = 'decorators' @@ -43,6 +43,7 @@ MD5_KEY = 'md5' INDEX_KEY = 'index' REQUIREMENT_KEY = 'requirements' MODULE_KEY = 'module' +CLASS_NAME = 'class_name' class AstScaning(object): @@ -237,6 +238,8 @@ class AstScaning(object): ['name']] = final_dict if 'decorator_list' == field and attr != []: + for item in attr: + setattr(item, CLASS_NAME, node.name) self.result_decorator.extend(attr) out += 
f'{indentstr()}{field}={representation},\n' @@ -294,7 +297,7 @@ class AstScaning(object): else: return getattr(eval(split_list[0]), split_list[1]) - def _registry_indexer(self, parsed_input: tuple) -> tuple: + def _registry_indexer(self, parsed_input: tuple, class_name: str) -> tuple: """format registry information to a tuple indexer Return: @@ -310,7 +313,7 @@ class AstScaning(object): if len(args_list) == 0 and len(keyword_list) == 0: args_list.append(default_group) if len(keyword_list) == 0 and len(args_list) == 1: - args_list.append(None) + args_list.append(class_name) if len(keyword_list) == 1 and len(args_list) == 0: args_list.append(default_group) @@ -344,7 +347,8 @@ class AstScaning(object): if type(node).__name__ != 'Call': continue parse_output = self._parse_decorator(node) - index = self._registry_indexer(parse_output) + index = self._registry_indexer(parse_output, + getattr(node, CLASS_NAME)) if None is not index: results.append(index) return results diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 927eafbd..1a3fb7c3 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -62,6 +62,9 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' crowd_counting = 'crowd-counting' + # video related + video_single_object_tracking = 'video-single-object-tracking' + class NLPTasks(object): # nlp tasks @@ -203,6 +206,8 @@ class ModelFile(object): TF_CKPT_PREFIX = 'ckpt-' TORCH_MODEL_FILE = 'pytorch_model.pt' TORCH_MODEL_BIN_FILE = 'pytorch_model.bin' + VOCAB_FILE = 'vocab.txt' + ONNX_MODEL_FILE = 'model.onnx' LABEL_MAPPING = 'label_mapping.json' @@ -219,6 +224,12 @@ class ConfigFields(object): evaluation = 'evaluation' +class ConfigKeys(object): + """Fixed keywords in configuration file""" + train = 'train' + val = 'val' + + class Requirements(object): """Requirement names for each module """ diff --git a/modelscope/utils/data_utils.py b/modelscope/utils/data_utils.py new file mode 100644 index 00000000..2bc88e19 --- /dev/null +++ b/modelscope/utils/data_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from collections.abc import Mapping + +import torch + + +def to_device(batch, device, non_blocking=False): + """Put the data to the target cuda device just before the forward function. + Args: + batch: The batch data out of the dataloader. + device: (str | torch.device): The target device for the data. + + Returns: The data to the target device. + + """ + if isinstance(batch, dict) or isinstance(batch, Mapping): + return type(batch)({k: to_device(v, device) for k, v in batch.items()}) + elif isinstance(batch, (tuple, list)): + return type(batch)(to_device(v, device) for v in batch) + elif isinstance(batch, torch.Tensor): + return batch.to(device, non_blocking=non_blocking) + else: + return batch diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index aca103d2..7889d944 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -24,65 +24,3 @@ def torch_nested_detach(tensors): if isinstance(tensors, torch.Tensor): return tensors.detach() return tensors - - -def torch_default_data_collator(features): - # TODO @jiangnana.jnn refine this default data collator - import torch - first = features[0] - - if isinstance(first, Mapping): - batch = {} - # Special handling for labels. - # Ensure that tensor is created with the correct type - # (it should be automatically the case, but let's make sure of it.) 
- if 'label' in first and first['label'] is not None: - label = first['label'].item() if isinstance( - first['label'], torch.Tensor) else first['label'] - # the msdataset return a 0-dimension np.array with a single value, the following part handle this. - if isinstance(label, np.ndarray): - src_dtype = label[()].dtype - dtype = torch.long if label[( - )].dtype == np.int64 else torch.float - else: - src_dtype = type(label) - dtype = torch.long if isinstance(label, int) else torch.float - # add dtype to np.array to fix "TypeError: can't convert np.ndarray of type numpy.object_" - batch['labels'] = torch.tensor( - np.array([f['label'] for f in features], dtype=src_dtype), - dtype=dtype) - elif 'label_ids' in first and first['label_ids'] is not None: - if isinstance(first['label_ids'], torch.Tensor): - batch['labels'] = torch.stack( - [f['label_ids'] for f in features]) - else: - dtype = torch.long if type( - first['label_ids'][0]) is int else torch.float - batch['labels'] = torch.tensor( - [f['label_ids'] for f in features], dtype=dtype) - - # Handling of all other possible keys. - # Again, we will use the first element to figure out which key/values are not None for this model. - for k, v in first.items(): - if k not in ('label', 'label_ids' - ) and v is not None and not isinstance(v, str): - if isinstance(v, torch.Tensor): - batch[k] = torch.stack([f[k] for f in features]) - elif isinstance(v, list) and isinstance(v[0], torch.Tensor): - batch[k] = torch.stack([d for f in features for d in f[k]]) - else: - batch[k] = torch.tensor(np.array([f[k] for f in features])) - elif isinstance(first, tuple): - batch = [] - for idx in range(len(first)): - if isinstance(first[idx], torch.Tensor): - batch.append(torch.stack([f[idx] for f in features])) - else: - batch.append(torch.tensor([f[idx] for f in features])) - else: - if isinstance(first, torch.Tensor): - batch = torch.stack(features) - else: - batch = torch.tensor(features) - - return batch diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index 5a606f9c..7adba982 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -50,7 +50,7 @@ def set_test_level(level: int): def create_dummy_test_dataset(feat, label, num): return MsDataset.from_hf_dataset( - Dataset.from_dict(dict(feat=[feat] * num, label=[label] * num))) + Dataset.from_dict(dict(feat=[feat] * num, labels=[label] * num))) def download_and_untar(fpath, furl, dst) -> str: diff --git a/modelscope/version.py b/modelscope/version.py index bfeb9e74..40ed83d9 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.3.4' +__version__ = '0.3.5' diff --git a/requirements/audio.txt b/requirements/audio.txt index 81d288bd..5e4bc104 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -16,6 +16,7 @@ numpy<=1.18 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
protobuf>3,<3.21.0 ptflops +py_sound_connect pytorch_wavelets PyWavelets>=1.0.0 scikit-learn diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ce18dcea..e2b78f06 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,6 +1,5 @@ addict -#version above 2.1.0 introduces backward-compatability issue which is being resolved -datasets==2.1.0 +datasets easydict einops filelock>=3.3.0 diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 0894ce3d..f9118353 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -4,6 +4,7 @@ from modelscope.models import Model from modelscope.msdatasets import MsDataset from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.preprocessors.base import Preprocessor +from modelscope.utils.constant import DownloadMode from modelscope.utils.test_utils import require_tf, require_torch, test_level @@ -30,6 +31,16 @@ class ImgPreprocessor(Preprocessor): class MsDatasetTest(unittest.TestCase): + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_coco(self): + ms_ds_train = MsDataset.load( + 'pets_small', + namespace='modelscope', + split='train', + download_mode=DownloadMode.FORCE_REDOWNLOAD, + classes=('1', '2')) + print(ms_ds_train._hf_ds.config_kwargs) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ms_csv_basic(self): ms_ds_train = MsDataset.load( diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py new file mode 100644 index 00000000..e7967edc --- /dev/null +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -0,0 +1,43 @@ +import os.path +import unittest + +from modelscope.fileio import File +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + +TEST_SPEECH_FILE = 'data/test/audios/3ch_nihaomiya.wav' + + +class KWSFarfieldTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} + result = kws(inputs) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_output(self): + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + inputs = { + 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE), + 'output_file': 'output.wav' + } + result = kws(inputs) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_input_bytes(self): + with open(os.path.join(os.getcwd(), TEST_SPEECH_FILE), 'rb') as f: + data = f.read() + kws = pipeline(Tasks.keyword_spotting, model=self.model_id) + result = kws(data) + self.assertEqual(len(result['kws_list']), 5) + print(result['kws_list'][-1]) diff --git a/tests/pipelines/test_mplug_tasks.py b/tests/pipelines/test_mplug_tasks.py new file mode 100644 index 00000000..4b8a813a --- /dev/null +++ b/tests/pipelines/test_mplug_tasks.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from PIL import Image + +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class MplugTasksTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_image_captioning_with_model(self): + model = Model.from_pretrained( + 'damo/mplug_image-captioning_coco_base_en') + pipeline_caption = pipeline( + task=Tasks.image_captioning, + model=model, + ) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + result = pipeline_caption({'image': image}) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_image_captioning_with_name(self): + pipeline_caption = pipeline( + Tasks.image_captioning, + model='damo/mplug_image-captioning_coco_base_en') + image = Image.open('data/test/images/image_mplug_vqa.jpg') + result = pipeline_caption({'image': image}) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_visual_question_answering_with_model(self): + model = Model.from_pretrained( + 'damo/mplug_visual-question-answering_coco_large_en') + pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + question = 'What is the woman doing?' + input = {'image': image, 'question': question} + result = pipeline_vqa(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_visual_question_answering_with_name(self): + model = 'damo/mplug_visual-question-answering_coco_large_en' + pipeline_vqa = pipeline(Tasks.visual_question_answering, model=model) + image = Image.open('data/test/images/image_mplug_vqa.jpg') + question = 'What is the woman doing?' 
+ input = {'image': image, 'question': question} + result = pipeline_vqa(input) + print(result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py index 3bf3af87..6152f279 100644 --- a/tests/pipelines/test_multi_modal_embedding.py +++ b/tests/pipelines/test_multi_modal_embedding.py @@ -2,50 +2,58 @@ import unittest -import numpy as np +import torch from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level class MultiModalEmbeddingTest(unittest.TestCase): - model_id = 'damo/multi-modal_clip-vit-large-patch14_zh' - test_text = {'text': '一张风景图'} + model_id = 'damo/multi-modal_clip-vit-base-patch16_zh' + test_input = {'text': '皮卡丘'} + model_version = 'dev' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run(self): - pipe_line_multi_modal_embedding = pipeline( - Tasks.multi_modal_embedding, model=self.model_id) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + pipeline_multi_modal_embedding = pipeline( + Tasks.multi_modal_embedding, + model=self.model_id, + model_revision=self.model_version) + text_embedding = pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, model=model) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_model_name(self): - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding, model=self.model_id) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + pipeline_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, + model=model, + model_revision=self.model_version) + text_embedding = pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_default_model(self): - pipe_line_multi_modal_embedding = pipeline( - task=Tasks.multi_modal_embedding) - test_str_embedding = pipe_line_multi_modal_embedding( - self.test_text)['text_embedding'] - print(np.sum(np.abs(test_str_embedding))) + pipeline_multi_modal_embedding = pipeline( + task=Tasks.multi_modal_embedding, + model_revision=self.model_version) + text_embedding = 
pipeline_multi_modal_embedding( + self.test_input)[OutputKeys.TEXT_EMBEDDING] + print('l1-norm: {}'.format( + torch.norm(text_embedding, p=1, dim=-1).item())) + print('l2-norm: {}'.format(torch.norm(text_embedding, + dim=-1).item())) # should be 1.0 if __name__ == '__main__': diff --git a/tests/pipelines/test_ocr_recognition.py b/tests/pipelines/test_ocr_recognition.py index d86c2266..a2e5ba8e 100644 --- a/tests/pipelines/test_ocr_recognition.py +++ b/tests/pipelines/test_ocr_recognition.py @@ -19,7 +19,7 @@ from modelscope.utils.test_utils import test_level class OCRRecognitionTest(unittest.TestCase): def setUp(self) -> None: - self.model_id = 'damo/cv_convnextTiny_ocr-recognition_damo' + self.model_id = 'damo/cv_convnextTiny_ocr-recognition-general_damo' self.test_image = 'data/test/images/ocr_recognition.jpg' def pipeline_inference(self, pipeline: Pipeline, input_location: str): diff --git a/tests/pipelines/test_salient_detection.py b/tests/pipelines/test_salient_detection.py new file mode 100644 index 00000000..ec010b17 --- /dev/null +++ b/tests/pipelines/test_salient_detection.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SalientDetectionTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_salient_detection(self): + input_location = 'data/test/images/image_salient_detection.jpg' + model_id = 'damo/cv_u2net_salient-detection' + salient_detect = pipeline(Tasks.image_segmentation, model=model_id) + result = salient_detect(input_location) + import cv2 + # result[OutputKeys.MASKS] is salient map result,other keys are not used + cv2.imwrite(input_location + '_salient.jpg', result[OutputKeys.MASKS]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index e8b4a551..007e6c73 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -8,22 +8,10 @@ from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level -NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav' -FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav' -NEAREND_MIC_FILE = 'nearend_mic.wav' -FAREND_SPEECH_FILE = 'farend_speech.wav' +NEAREND_MIC_FILE = 'data/test/audios/nearend_mic.wav' +FAREND_SPEECH_FILE = 'data/test/audios/farend_speech.wav' -NOISE_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ANS/sample_audio/speech_with_noise.wav' -NOISE_SPEECH_FILE = 'speech_with_noise.wav' - - -def download(remote_path, local_path): - local_dir = os.path.dirname(local_path) - if len(local_dir) > 0: - if not os.path.exists(local_dir): - os.makedirs(local_dir) - with open(local_path, 'wb') as ofile: - ofile.write(File.read(remote_path)) +NOISE_SPEECH_FILE = 'data/test/audios/speech_with_noise.wav' class SpeechSignalProcessTest(unittest.TestCase): @@ -33,13 +21,10 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_aec(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - 
download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' input = { - 'nearend_mic': NEAREND_MIC_FILE, - 'farend_speech': FAREND_SPEECH_FILE + 'nearend_mic': os.path.join(os.getcwd(), NEAREND_MIC_FILE), + 'farend_speech': os.path.join(os.getcwd(), FAREND_SPEECH_FILE) } aec = pipeline(Tasks.acoustic_echo_cancellation, model=model_id) output_path = os.path.abspath('output.wav') @@ -48,14 +33,11 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec_bytes(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' input = {} - with open(NEAREND_MIC_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NEAREND_MIC_FILE), 'rb') as f: input['nearend_mic'] = f.read() - with open(FAREND_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), FAREND_SPEECH_FILE), 'rb') as f: input['farend_speech'] = f.read() aec = pipeline( Tasks.acoustic_echo_cancellation, @@ -67,13 +49,10 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_aec_tuple_bytes(self): - # Download audio files - download(NEAREND_MIC_URL, NEAREND_MIC_FILE) - download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) model_id = 'damo/speech_dfsmn_aec_psm_16k' - with open(NEAREND_MIC_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NEAREND_MIC_FILE), 'rb') as f: nearend_bytes = f.read() - with open(FAREND_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), FAREND_SPEECH_FILE), 'rb') as f: farend_bytes = f.read() inputs = (nearend_bytes, farend_bytes) aec = pipeline( @@ -86,25 +65,22 @@ class SpeechSignalProcessTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_ans(self): - # Download audio files - download(NOISE_SPEECH_URL, NOISE_SPEECH_FILE) model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline(Tasks.acoustic_noise_suppression, model=model_id) output_path = os.path.abspath('output.wav') - ans(NOISE_SPEECH_FILE, output_path=output_path) + ans(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), + output_path=output_path) print(f'Processed audio saved to {output_path}') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ans_bytes(self): - # Download audio files - download(NOISE_SPEECH_URL, NOISE_SPEECH_FILE) model_id = 'damo/speech_frcrn_ans_cirm_16k' ans = pipeline( Tasks.acoustic_noise_suppression, model=model_id, pipeline_name=Pipelines.speech_frcrn_ans_cirm_16k) output_path = os.path.abspath('output.wav') - with open(NOISE_SPEECH_FILE, 'rb') as f: + with open(os.path.join(os.getcwd(), NOISE_SPEECH_FILE), 'rb') as f: data = f.read() ans(data, output_path=output_path) print(f'Processed audio saved to {output_path}') diff --git a/tests/pipelines/test_video_single_object_tracking.py b/tests/pipelines/test_video_single_object_tracking.py new file mode 100644 index 00000000..f5d4714c --- /dev/null +++ b/tests/pipelines/test_video_single_object_tracking.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest
+
+from modelscope.models.cv.video_single_object_tracking.utils.utils import \
+    show_tracking_result
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class SingleObjectTracking(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.model_id = 'damo/cv_vitb_video-single-object-tracking_ostrack'
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_end2end(self):
+        video_single_object_tracking = pipeline(
+            Tasks.video_single_object_tracking, model=self.model_id)
+        video_path = 'data/test/videos/dog.avi'
+        init_bbox = [414, 343, 514, 449]  # [x1, y1, x2, y2]
+        result = video_single_object_tracking((video_path, init_bbox))
+        print('result is : ', result[OutputKeys.BOXES])
+        show_tracking_result(video_path, result[OutputKeys.BOXES],
+                             './tracking_result.avi')
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_modelhub_default_model(self):
+        video_single_object_tracking = pipeline(
+            Tasks.video_single_object_tracking)
+        video_path = 'data/test/videos/dog.avi'
+        init_bbox = [414, 343, 514, 449]  # [x1, y1, x2, y2]
+        result = video_single_object_tracking((video_path, init_bbox))
+        print('result is : ', result[OutputKeys.BOXES])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/pipelines/test_visual_question_answering.py b/tests/pipelines/test_visual_question_answering.py
deleted file mode 100644
index 748a86b9..00000000
--- a/tests/pipelines/test_visual_question_answering.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import unittest
-
-from PIL import Image
-
-from modelscope.hub.snapshot_download import snapshot_download
-from modelscope.models import Model
-from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
-from modelscope.pipelines import pipeline
-from modelscope.pipelines.multi_modal import VisualQuestionAnsweringPipeline
-from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
-from modelscope.utils.constant import Tasks
-from modelscope.utils.test_utils import test_level
-
-
-class VisualQuestionAnsweringTest(unittest.TestCase):
-
-    def setUp(self):
-        self.model_id = 'damo/mplug_visual-question-answering_coco_large_en'
-        self.input_vqa = {
-            'image': Image.open('data/test/images/image_mplug_vqa.jpg'),
-            'question': 'What is the woman doing?',
-        }
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run(self):
-        cache_path = snapshot_download(self.model_id)
-        preprocessor = MPlugVisualQuestionAnsweringPreprocessor(cache_path)
-        model = MPlugForVisualQuestionAnswering(cache_path)
-        pipeline1 = VisualQuestionAnsweringPipeline(
-            model, preprocessor=preprocessor)
-        pipeline2 = pipeline(
-            Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
-        print(f"question: {self.input_vqa['question']}")
-        print(f'pipeline1: {pipeline1(self.input_vqa)}')
-        print(f'pipeline2: {pipeline2(self.input_vqa)}')
-
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-    def test_run_with_model_from_modelhub(self):
-        model = Model.from_pretrained(self.model_id)
-        preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
-            model.model_dir)
-        pipeline_vqa = pipeline(
-            task=Tasks.visual_question_answering,
-            model=model,
-            preprocessor=preprocessor)
-        print(pipeline_vqa(self.input_vqa))
-
-
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_model_name(self): - pipeline_vqa = pipeline( - Tasks.visual_question_answering, model=self.model_id) - print(pipeline_vqa(self.input_vqa)) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run_with_default_model(self): - pipeline_vqa = pipeline(task=Tasks.visual_question_answering) - print(pipeline_vqa(self.input_vqa)) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/preprocessors/test_common.py b/tests/preprocessors/test_common.py index 1ee13589..714b8588 100644 --- a/tests/preprocessors/test_common.py +++ b/tests/preprocessors/test_common.py @@ -2,7 +2,10 @@ import unittest -from modelscope.preprocessors import PREPROCESSORS, Compose, Preprocessor +import torch + +from modelscope.preprocessors import (PREPROCESSORS, Compose, Filter, + Preprocessor, ToTensor) class ComposeTest(unittest.TestCase): @@ -35,5 +38,27 @@ class ComposeTest(unittest.TestCase): self.assertEqual(output['tmp2'], 'tmp2') +class ToTensorTest(unittest.TestCase): + + def test_totensor(self): + to_tensor_op = ToTensor(keys=['img']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = to_tensor_op(inputs) + self.assertIsInstance(inputs['img'], torch.Tensor) + self.assertEqual(inputs['label'], 1) + self.assertEqual(inputs['path'], 'test.jpg') + + +class FilterTest(unittest.TestCase): + + def test_filter(self): + filter_op = Filter(reserved_keys=['img', 'label']) + inputs = {'img': [1, 2, 3], 'label': 1, 'path': 'test.jpg'} + inputs = filter_op(inputs) + self.assertIn('img', inputs) + self.assertIn('label', inputs) + self.assertNotIn('path', inputs) + + if __name__ == '__main__': unittest.main() diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index fca01597..4271e201 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -2,7 +2,7 @@ import unittest -from modelscope.preprocessors import build_preprocessor +from modelscope.preprocessors import build_preprocessor, nlp from modelscope.utils.constant import Fields, InputFields from modelscope.utils.logger import get_logger diff --git a/tests/taskdataset/test_veco_dataset.py b/tests/taskdataset/test_veco_dataset.py index fc59750d..76da1681 100644 --- a/tests/taskdataset/test_veco_dataset.py +++ b/tests/taskdataset/test_veco_dataset.py @@ -2,7 +2,7 @@ import unittest -from modelscope.task_datasets.veco_dataset import VecoDataset +from modelscope.msdatasets.task_datasets.veco_dataset import VecoDataset from modelscope.utils.test_utils import test_level diff --git a/tests/trainers/hooks/test_evaluation_hook.py b/tests/trainers/hooks/test_evaluation_hook.py index 9e65f127..1338bb2c 100644 --- a/tests/trainers/hooks/test_evaluation_hook.py +++ b/tests/trainers/hooks/test_evaluation_hook.py @@ -12,7 +12,7 @@ from torch import nn from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys from modelscope.trainers import build_trainer -from modelscope.utils.constant import LogKeys, ModelFile +from modelscope.utils.constant import ModelFile from modelscope.utils.registry import default_group from modelscope.utils.test_utils import create_dummy_test_dataset diff --git a/tests/trainers/hooks/test_lr_scheduler_hook.py b/tests/trainers/hooks/test_lr_scheduler_hook.py index eb30fb52..86d53ecc 100644 --- a/tests/trainers/hooks/test_lr_scheduler_hook.py +++ b/tests/trainers/hooks/test_lr_scheduler_hook.py @@ -9,7 +9,7 @@ 
import numpy as np import torch from torch import nn from torch.optim import SGD -from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau +from torch.optim.lr_scheduler import MultiStepLR from modelscope.metainfo import Trainers from modelscope.metrics.builder import METRICS, MetricKeys @@ -96,7 +96,8 @@ class LrSchedulerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -160,15 +161,13 @@ class LrSchedulerHookTest(unittest.TestCase): json.dump(json_cfg, f) model = DummyModel() - # optimmizer = SGD(model.parameters(), lr=0.01) - # lr_scheduler = MultiStepLR(optimmizer, milestones=[2, 4]) trainer_name = Trainers.default kwargs = dict( cfg_file=config_path, model=model, train_dataset=dummy_dataset, - # optimizers=(optimmizer, lr_scheduler), - max_epochs=7) + max_epochs=7, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -266,7 +265,8 @@ class PlateauLrSchedulerHookTest(unittest.TestCase): train_dataset=dummy_dataset, eval_dataset=dummy_dataset, optimizers=(optimizer, None), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_optimizer_hook.py b/tests/trainers/hooks/test_optimizer_hook.py index 62c70632..25457c1c 100644 --- a/tests/trainers/hooks/test_optimizer_hook.py +++ b/tests/trainers/hooks/test_optimizer_hook.py @@ -17,7 +17,7 @@ from modelscope.utils.constant import ModelFile, TrainerStages from modelscope.utils.test_utils import create_dummy_test_dataset dummy_dataset = create_dummy_test_dataset( - np.random.random(size=(2, 2)), np.random.randint(0, 2, (1, )), 10) + np.random.random(size=(2, )), np.random.randint(0, 2, (1, )), 10) class DummyModel(nn.Module): @@ -71,7 +71,8 @@ class OptimizerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=2) + max_epochs=2, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( diff --git a/tests/trainers/hooks/test_timer_hook.py b/tests/trainers/hooks/test_timer_hook.py index 6f24809b..614f7688 100644 --- a/tests/trainers/hooks/test_timer_hook.py +++ b/tests/trainers/hooks/test_timer_hook.py @@ -75,7 +75,8 @@ class IterTimerHookTest(unittest.TestCase): model=model, train_dataset=dummy_dataset, optimizers=(optimizer, lr_scheduler), - max_epochs=5) + max_epochs=5, + device='cpu') trainer = build_trainer(trainer_name, kwargs) train_dataloader = trainer._build_dataloader_with_dataset( @@ -83,6 +84,7 @@ class IterTimerHookTest(unittest.TestCase): trainer.register_optimizers_hook() trainer.register_hook_from_cfg(trainer.cfg.train.hooks) trainer.data_loader = train_dataloader + trainer.train_dataloader = train_dataloader trainer.invoke_hook(TrainerStages.before_run) for i in range(trainer._epoch, trainer._max_epochs): trainer.invoke_hook(TrainerStages.before_train_epoch) diff --git a/tests/trainers/test_clip_multi_modal_embedding_trainer.py b/tests/trainers/test_clip_multi_modal_embedding_trainer.py deleted file mode 100644 index 03f82854..00000000 --- a/tests/trainers/test_clip_multi_modal_embedding_trainer.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import tempfile -import unittest - 
-import requests -import torch -import torch.distributed as dist -import torch.multiprocessing as mp - -from modelscope.hub.snapshot_download import snapshot_download -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from modelscope.utils.constant import ModelFile -from modelscope.utils.logger import get_logger -from modelscope.utils.test_utils import test_level - -logger = get_logger() - - -def clip_train_worker(local_rank, ngpus, node_size, node_rank): - global_rank = local_rank + node_rank * ngpus - dist_world_size = node_size * ngpus - - dist.init_process_group( - backend='nccl', world_size=dist_world_size, rank=global_rank) - - model_id = 'damo/multi-modal_clip-vit-large-patch14_zh' - local_model_dir = snapshot_download(model_id) - - default_args = dict( - cfg_file='{}/{}'.format(local_model_dir, ModelFile.CONFIGURATION), - model=model_id, - device_id=local_rank) - trainer = build_trainer( - name=Trainers.clip_multi_modal_embedding, default_args=default_args) - - trainer.train() - trainer.evaluate() - - -class CLIPMultiModalEmbeddingTrainerTest(unittest.TestCase): - - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_trainer(self): - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = '2001' - NODE_SIZE, NODE_RANK = 1, 0 - logger.info('Train clip with {} machines'.format(NODE_SIZE)) - ngpus = torch.cuda.device_count() - logger.info('Machine: {} has {} GPUs'.format(NODE_RANK, ngpus)) - mp.spawn( - clip_train_worker, - nprocs=ngpus, - args=(ngpus, NODE_SIZE, NODE_RANK)) - logger.info('Training done') - - -if __name__ == '__main__': - unittest.main() - ... diff --git a/tests/trainers/test_image_instance_segmentation_trainer.py b/tests/trainers/test_image_instance_segmentation_trainer.py index 35d0378f..c8557ff5 100644 --- a/tests/trainers/test_image_instance_segmentation_trainer.py +++ b/tests/trainers/test_image_instance_segmentation_trainer.py @@ -8,10 +8,13 @@ from functools import partial from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Trainers -from modelscope.models.cv.image_instance_segmentation import ( - CascadeMaskRCNNSwinModel, ImageInstanceSegmentationCocoDataset) +from modelscope.models.cv.image_instance_segmentation import \ + CascadeMaskRCNNSwinModel +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets import \ + ImageInstanceSegmentationCocoDataset from modelscope.trainers import build_trainer -from modelscope.utils.config import Config +from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level @@ -27,34 +30,47 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): config_path = os.path.join(cache_path, ModelFile.CONFIGURATION) cfg = Config.from_file(config_path) - data_root = cfg.dataset.data_root - classes = tuple(cfg.dataset.classes) max_epochs = cfg.train.max_epochs samples_per_gpu = cfg.train.dataloader.batch_size_per_gpu - - if data_root is None: + try: + train_data_cfg = cfg.dataset.train + val_data_cfg = cfg.dataset.val + except Exception: + train_data_cfg = None + val_data_cfg = None + if train_data_cfg is None: # use default toy data - dataset_path = os.path.join(cache_path, 'toydata.zip') - with zipfile.ZipFile(dataset_path, 'r') as zipf: - zipf.extractall(cache_path) - data_root = cache_path + '/toydata/' - classes = ('Cat', 'Dog') - - self.train_dataset = 
ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_train.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/train/', - seg_prefix=None, - test_mode=False) - - self.eval_dataset = ImageInstanceSegmentationCocoDataset( - data_root + 'annotations/instances_val.json', - classes=classes, - data_root=data_root, - img_prefix=data_root + 'images/val/', - seg_prefix=None, - test_mode=True) + train_data_cfg = ConfigDict( + name='pets_small', + split='train', + classes=('Cat', 'Dog'), + test_mode=False) + if val_data_cfg is None: + val_data_cfg = ConfigDict( + name='pets_small', + split='validation', + classes=('Cat', 'Dog'), + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + classes=train_data_cfg.classes, + test_mode=train_data_cfg.test_mode) + assert self.train_dataset.config_kwargs[ + 'classes'] == train_data_cfg.classes + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.eval_dataset = MsDataset.load( + dataset_name=val_data_cfg.name, + split=val_data_cfg.split, + classes=val_data_cfg.classes, + test_mode=val_data_cfg.test_mode) + assert self.eval_dataset.config_kwargs[ + 'classes'] == val_data_cfg.classes + assert next( + iter(self.eval_dataset.config_kwargs['split_config'].values())) from mmcv.parallel import collate diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index b7639024..0259f804 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -3,23 +3,31 @@ import os import shutil import tempfile import unittest -from abc import ABCMeta import json import numpy as np import torch -from datasets import Dataset from torch import nn from torch.optim import SGD from torch.optim.lr_scheduler import StepLR +from torch.utils.data import IterableDataset from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys -from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import create_dummy_test_dataset, test_level + +class DummyIterableDataset(IterableDataset): + + def __iter__(self): + feat = np.random.random(size=(5, )).astype(np.float32) + labels = np.random.randint(0, 4, (1, )) + iterations = [{'feat': feat, 'labels': labels}] * 500 + return iter(iterations) + + dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -116,7 +124,8 @@ class TrainerTest(unittest.TestCase): data_collator=None, train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -175,7 +184,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_small, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -225,7 +235,8 @@ class TrainerTest(unittest.TestCase): train_dataset=dummy_dataset_big, eval_dataset=dummy_dataset_small, optimizers=(optimmizer, lr_scheduler), - max_epochs=3) + max_epochs=3, + device='cpu') trainer = build_trainer(trainer_name, kwargs) trainer.train() @@ -303,6 +314,124 @@ class TrainerTest(unittest.TestCase): for i in [2, 5, 8]: self.assertIn(MetricKeys.ACCURACY, lines[i]) + 
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_train_with_iters_per_epoch(self): + json_cfg = { + 'train': { + 'work_dir': self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'EvaluationHook', + 'interval': 1 + }] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1, + 'shuffle': False + }, + 'metrics': [Metrics.seq_cls_metric] + } + } + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimmizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = StepLR(optimmizer, 2) + trainer_name = Trainers.default + kwargs = dict( + cfg_file=config_path, + model=model, + data_collator=None, + optimizers=(optimmizer, lr_scheduler), + train_dataset=DummyIterableDataset(), + eval_dataset=DummyIterableDataset(), + train_iters_per_epoch=20, + val_iters_per_epoch=10, + max_epochs=3, + device='cpu') + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + json_file = os.path.join(self.tmp_dir, f'{trainer.timestamp}.log.json') + with open(json_file, 'r') as f: + lines = [i.strip() for i in f.readlines()] + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 10, + LogKeys.LR: 0.01 + }, json.loads(lines[0])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 1, + LogKeys.ITER: 20, + LogKeys.LR: 0.01 + }, json.loads(lines[1])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 1, + LogKeys.ITER: 10 + }, json.loads(lines[2])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 10, + LogKeys.LR: 0.01 + }, json.loads(lines[3])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 2, + LogKeys.ITER: 20, + LogKeys.LR: 0.01 + }, json.loads(lines[4])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 2, + LogKeys.ITER: 10 + }, json.loads(lines[5])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 3, + LogKeys.ITER: 10, + LogKeys.LR: 0.001 + }, json.loads(lines[6])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.TRAIN, + LogKeys.EPOCH: 3, + LogKeys.ITER: 20, + LogKeys.LR: 0.001 + }, json.loads(lines[7])) + self.assertDictContainsSubset( + { + LogKeys.MODE: ModeKeys.EVAL, + LogKeys.EPOCH: 3, + LogKeys.ITER: 10 + }, json.loads(lines[8])) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) + for i in [0, 1, 3, 4, 6, 7]: + self.assertIn(LogKeys.DATA_LOAD_TIME, lines[i]) + self.assertIn(LogKeys.ITER_TIME, lines[i]) + for i in [2, 5, 8]: + self.assertIn(MetricKeys.ACCURACY, lines[i]) + class DummyTrainerTest(unittest.TestCase): diff --git a/tests/trainers/test_trainer_gpu.py b/tests/trainers/test_trainer_gpu.py index 30390a68..9781816d 100644 --- a/tests/trainers/test_trainer_gpu.py +++ b/tests/trainers/test_trainer_gpu.py @@ -11,6 +11,7 @@ import torch from torch import nn from torch.optim import SGD from torch.optim.lr_scheduler import StepLR +from torch.utils.data import IterableDataset from modelscope.metainfo import Metrics, Trainers from modelscope.metrics.builder import MetricKeys @@ -19,6 +20,16 @@ from 
modelscope.utils.constant import LogKeys, ModeKeys, ModelFile from modelscope.utils.test_utils import (DistributedTestCase, create_dummy_test_dataset, test_level) + +class DummyIterableDataset(IterableDataset): + + def __iter__(self): + feat = np.random.random(size=(5, )).astype(np.float32) + labels = np.random.randint(0, 4, (1, )) + iterations = [{'feat': feat, 'labels': labels}] * 500 + return iter(iterations) + + dummy_dataset_small = create_dummy_test_dataset( np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 20) @@ -41,7 +52,7 @@ class DummyModel(nn.Module): return dict(logits=x, loss=loss) -def train_func(work_dir, dist=False): +def train_func(work_dir, dist=False, iterable_dataset=False, **kwargs): json_cfg = { 'train': { 'work_dir': work_dir, @@ -72,18 +83,25 @@ def train_func(work_dir, dist=False): optimmizer = SGD(model.parameters(), lr=0.01) lr_scheduler = StepLR(optimmizer, 2) trainer_name = Trainers.default - kwargs = dict( + if iterable_dataset: + train_dataset = DummyIterableDataset() + eval_dataset = DummyIterableDataset() + else: + train_dataset = dummy_dataset_big + eval_dataset = dummy_dataset_small + _kwargs = dict( cfg_file=config_path, model=model, data_collator=None, - train_dataset=dummy_dataset_big, - eval_dataset=dummy_dataset_small, + train_dataset=train_dataset, + eval_dataset=eval_dataset, optimizers=(optimmizer, lr_scheduler), max_epochs=3, device='gpu', - launcher='pytorch' if dist else None) + launcher='pytorch' if dist else None, + **kwargs) - trainer = build_trainer(trainer_name, kwargs) + trainer = build_trainer(trainer_name, _kwargs) trainer.train() @@ -253,6 +271,28 @@ class TrainerTestMultiGpus(DistributedTestCase): for i in [1, 3, 5]: self.assertIn(MetricKeys.ACCURACY, lines[i]) + # TODO: support iters_per_epoch for dist mode + @unittest.skipIf(True, 'need to adapt to DistributedSampler') + def test_multi_gpus_with_iters_per_epoch(self): + self.start( + train_func, + num_gpus=2, + work_dir=self.tmp_dir, + dist=True, + iterable_dataset=True, + train_iters_per_epoch=20, + val_iters_per_epoch=10, + ) + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + + with open(json_files[0], 'r') as f: + lines = [i.strip() for i in f.readlines()] + + print(results_files, lines) + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_trainer_with_nlp.py b/tests/trainers/test_trainer_with_nlp.py index 7e488c6b..213b6b4f 100644 --- a/tests/trainers/test_trainer_with_nlp.py +++ b/tests/trainers/test_trainer_with_nlp.py @@ -37,7 +37,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -53,7 +54,8 @@ class TestTrainerWithNlp(unittest.TestCase): model=model_id, train_dataset=self.dataset, eval_dataset=self.dataset, - work_dir=self.tmp_dir) + work_dir=self.tmp_dir, + model_revision='beta') trainer = build_trainer(default_args=kwargs) trainer.train() @@ -69,7 +71,7 @@ class TestTrainerWithNlp(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_user_defined_config(self): model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' - cfg = read_config(model_id) + cfg = read_config(model_id, revision='beta') cfg.train.max_epochs = 20 cfg.train.work_dir = self.tmp_dir 
        cfg_file = os.path.join(self.tmp_dir, 'config.json')
@@ -78,7 +80,8 @@ class TestTrainerWithNlp(unittest.TestCase):
             model=model_id,
             train_dataset=self.dataset,
             eval_dataset=self.dataset,
-            cfg_file=cfg_file)
+            cfg_file=cfg_file,
+            model_revision='beta')
         trainer = build_trainer(default_args=kwargs)
 
         trainer.train()
@@ -98,7 +101,7 @@ class TestTrainerWithNlp(unittest.TestCase):
             os.makedirs(tmp_dir)
 
         model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
-        cache_path = snapshot_download(model_id)
+        cache_path = snapshot_download(model_id, revision='beta')
         model = SbertForSequenceClassification.from_pretrained(cache_path)
         kwargs = dict(
             cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
diff --git a/tests/trainers/utils/__init__.py b/tests/trainers/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/trainers/utils/test_inference.py b/tests/trainers/utils/test_inference.py
new file mode 100644
index 00000000..87e5320e
--- /dev/null
+++ b/tests/trainers/utils/test_inference.py
@@ -0,0 +1,116 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import shutil
+import tempfile
+import unittest
+
+import torch
+from torch import nn
+from torch.utils.data import DataLoader
+
+from modelscope.metrics.builder import MetricKeys
+from modelscope.metrics.sequence_classification_metric import \
+    SequenceClassificationMetric
+from modelscope.trainers.utils.inference import multi_gpu_test, single_gpu_test
+from modelscope.utils.test_utils import (DistributedTestCase,
+                                         create_dummy_test_dataset, test_level)
+from modelscope.utils.torch_utils import get_dist_info, init_dist
+
+dummy_dataset = create_dummy_test_dataset(
+    torch.rand((5, )), torch.randint(0, 4, (1, )), 20)
+
+
+class DummyModel(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(5, 4)
+        self.bn = nn.BatchNorm1d(4)
+
+    def forward(self, feat, labels):
+        x = self.linear(feat)
+
+        x = self.bn(x)
+        loss = torch.sum(x)
+        return dict(logits=x, loss=loss)
+
+
+def test_func(dist=False):
+    dummy_model = DummyModel()
+    dataset = dummy_dataset.to_torch_dataset()
+
+    dummy_loader = DataLoader(
+        dataset,
+        batch_size=2,
+    )
+
+    metric_class = SequenceClassificationMetric()
+
+    if dist:
+        init_dist(launcher='pytorch')
+
+    rank, world_size = get_dist_info()
+    device = torch.device(f'cuda:{rank}')
+    dummy_model.cuda()
+
+    if world_size > 1:
+        from torch.nn.parallel.distributed import DistributedDataParallel
+        dummy_model = DistributedDataParallel(
+            dummy_model, device_ids=[torch.cuda.current_device()])
+        test_func = multi_gpu_test
+    else:
+        test_func = single_gpu_test
+
+    metric_results = test_func(
+        dummy_model,
+        dummy_loader,
+        device=device,
+        metric_classes=[metric_class])
+
+    return metric_results
+
+
+@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest')
+class SingleGpuTestTest(unittest.TestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_single_gpu_test(self):
+        metric_results = test_func()
+        self.assertIn(MetricKeys.ACCURACY, metric_results)
+
+
+@unittest.skipIf(not torch.cuda.is_available()
+                 or torch.cuda.device_count() <= 1, 'distributed unittest')
+class MultiGpuTestTest(DistributedTestCase):
+
+    def setUp(self):
+        print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
+        self.tmp_dir = tempfile.TemporaryDirectory().name
+        if not os.path.exists(self.tmp_dir):
+            os.makedirs(self.tmp_dir)
+
+    def tearDown(self):
+        super().tearDown()
+        shutil.rmtree(self.tmp_dir)
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_multi_gpu_test(self):
+        self.start(
+            test_func,
+            num_gpus=2,
+            assert_callback=lambda x: self.assertIn(MetricKeys.ACCURACY, x),
+            dist=True)
+
+
+if __name__ == '__main__':
+    unittest.main()