
Merge branch 'master' into ofa/finetune

# Conflicts:
#	modelscope/preprocessors/ofa/base.py
#	modelscope/preprocessors/ofa/image_captioning.py
#	modelscope/preprocessors/ofa/image_classification.py
#	modelscope/preprocessors/ofa/summarization.py
#	modelscope/preprocessors/ofa/text_classification.py
#	modelscope/preprocessors/ofa/text_to_image_synthesis.py
#	modelscope/preprocessors/ofa/visual_entailment.py
#	modelscope/preprocessors/ofa/visual_grounding.py
#	modelscope/preprocessors/ofa/visual_question_answering.py
Branch: master
Author: 行嗔, 3 years ago
Commit: 279f64b334
100 changed files with 5750 additions and 840 deletions
1. .gitattributes (+1, -0)
2. data/test/audios/3ch_nihaomiya.wav (+3, -0)
3. data/test/audios/farend_speech.wav (+3, -0)
4. data/test/audios/nearend_mic.wav (+3, -0)
5. data/test/audios/speech_with_noise.wav (+3, -0)
6. data/test/images/image_salient_detection.jpg (+3, -0)
7. data/test/images/ocr_recognition_document.png (+3, -0)
8. data/test/videos/dog.avi (+3, -0)
9. modelscope/hub/api.py (+4, -2)
10. modelscope/metainfo.py (+5, -1)
11. modelscope/models/audio/kws/__init__.py (+2, -0)
12. modelscope/models/audio/kws/farfield/__init__.py (+0, -0)
13. modelscope/models/audio/kws/farfield/fsmn.py (+495, -0)
14. modelscope/models/audio/kws/farfield/fsmn_sele_v2.py (+236, -0)
15. modelscope/models/audio/kws/farfield/model.py (+74, -0)
16. modelscope/models/audio/kws/farfield/model_def.py (+121, -0)
17. modelscope/models/cv/__init__.py (+2, -1)
18. modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py (+0, -14)
19. modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)
20. modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)
21. modelscope/models/cv/object_detection/mmdet_model.py (+2, -2)
22. modelscope/models/cv/salient_detection/__init__.py (+22, -0)
23. modelscope/models/cv/salient_detection/models/__init__.py (+1, -0)
24. modelscope/models/cv/salient_detection/models/u2net.py (+300, -0)
25. modelscope/models/cv/salient_detection/salient_model.py (+63, -0)
26. modelscope/models/cv/video_single_object_tracking/__init__.py (+0, -0)
27. modelscope/models/cv/video_single_object_tracking/config/__init__.py (+0, -0)
28. modelscope/models/cv/video_single_object_tracking/config/ostrack.py (+39, -0)
29. modelscope/models/cv/video_single_object_tracking/models/__init__.py (+0, -0)
30. modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py (+0, -0)
31. modelscope/models/cv/video_single_object_tracking/models/layers/attn.py (+54, -0)
32. modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py (+129, -0)
33. modelscope/models/cv/video_single_object_tracking/models/layers/head.py (+141, -0)
34. modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py (+37, -0)
35. modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py (+0, -0)
36. modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py (+93, -0)
37. modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py (+109, -0)
38. modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py (+24, -0)
39. modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py (+343, -0)
40. modelscope/models/cv/video_single_object_tracking/tracker/__init__.py (+0, -0)
41. modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py (+139, -0)
42. modelscope/models/cv/video_single_object_tracking/utils/__init__.py (+0, -0)
43. modelscope/models/cv/video_single_object_tracking/utils/utils.py (+261, -0)
44. modelscope/models/multi_modal/__init__.py (+4, -4)
45. modelscope/models/multi_modal/clip/__init__.py (+1, -1)
46. modelscope/models/multi_modal/clip/bert_tokenizer.py (+422, -0)
47. modelscope/models/multi_modal/clip/clip_bert.py (+0, -29)
48. modelscope/models/multi_modal/clip/clip_model.py (+0, -216)
49. modelscope/models/multi_modal/clip/clip_vit.py (+0, -131)
50. modelscope/models/multi_modal/clip/configuration_bert.py (+82, -0)
51. modelscope/models/multi_modal/clip/model.py (+677, -0)
52. modelscope/models/multi_modal/clip/modeling_bert.py (+507, -0)
53. modelscope/models/multi_modal/diffusion/model.py (+1, -1)
54. modelscope/models/multi_modal/gemm/gemm_base.py (+3, -3)
55. modelscope/models/multi_modal/mplug/__init__.py (+1, -2)
56. modelscope/models/multi_modal/mplug/clip/clip.py (+61, -1)
57. modelscope/models/multi_modal/mplug/configuration_mplug.py (+6, -23)
58. modelscope/models/multi_modal/mplug/modeling_mplug.py (+376, -144)
59. modelscope/models/multi_modal/mplug_for_all_tasks.py (+15, -5)
60. modelscope/models/multi_modal/ofa/tokenization_ofa.py (+3, -1)
61. modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py (+2, -1)
62. modelscope/models/nlp/structbert/tokenization_sbert.py (+2, -1)
63. modelscope/models/nlp/structbert/tokenization_sbert_fast.py (+2, -1)
64. modelscope/msdatasets/ms_dataset.py (+54, -32)
65. modelscope/msdatasets/task_datasets/__init__.py (+3, -0)
66. modelscope/msdatasets/task_datasets/base.py (+0, -0)
67. modelscope/msdatasets/task_datasets/builder.py (+0, -0)
68. modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py (+35, -26)
69. modelscope/msdatasets/task_datasets/torch_base_dataset.py (+0, -0)
70. modelscope/msdatasets/task_datasets/veco_dataset.py (+0, -0)
71. modelscope/msdatasets/utils/dataset_builder.py (+92, -3)
72. modelscope/msdatasets/utils/dataset_utils.py (+28, -11)
73. modelscope/outputs.py (+24, -1)
74. modelscope/pipelines/audio/__init__.py (+2, -0)
75. modelscope/pipelines/audio/kws_farfield_pipeline.py (+81, -0)
76. modelscope/pipelines/base.py (+1, -1)
77. modelscope/pipelines/builder.py (+6, -2)
78. modelscope/pipelines/cv/__init__.py (+2, -0)
79. modelscope/pipelines/cv/image_salient_detection_pipeline.py (+47, -0)
80. modelscope/pipelines/cv/video_single_object_tracking_pipeline.py (+80, -0)
81. modelscope/pipelines/multi_modal/image_captioning_pipeline.py (+19, -5)
82. modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py (+10, -26)
83. modelscope/preprocessors/__init__.py (+4, -6)
84. modelscope/preprocessors/common.py (+91, -2)
85. modelscope/preprocessors/image.py (+8, -0)
86. modelscope/preprocessors/multi_modal.py (+59, -27)
87. modelscope/preprocessors/nlp.py (+12, -7)
88. modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py (+1, -1)
89. modelscope/preprocessors/space/dialog_modeling_preprocessor.py (+1, -1)
90. modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py (+1, -1)
91. modelscope/preprocessors/space/fields/gen_field.py (+2, -1)
92. modelscope/preprocessors/space/fields/intent_field.py (+2, -1)
93. modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py (+1, -1)
94. modelscope/preprocessors/star/fields/common_utils.py (+9, -0)
95. modelscope/trainers/cv/image_instance_segmentation_trainer.py (+0, -4)
96. modelscope/trainers/cv/image_portrait_enhancement_trainer.py (+0, -1)
97. modelscope/trainers/hooks/hook.py (+1, -1)
98. modelscope/trainers/hooks/logger/text_logger_hook.py (+1, -1)
99. modelscope/trainers/nlp_trainer.py (+39, -15)
100. modelscope/trainers/trainer.py (+151, -76)

.gitattributes (+1, -0)

@@ -4,3 +4,4 @@
*.wav filter=lfs diff=lfs merge=lfs -text
*.JPEG filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.avi filter=lfs diff=lfs merge=lfs -text

data/test/audios/3ch_nihaomiya.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ad1a268c614076614a2ae6528abc29cc85ae35826d172079d7d9b26a0299559
size 4325096

data/test/audios/farend_speech.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3637ee0628d0953f77d5a32327980af542c43230c4127d2a72b4df1ea2ffb0be
size 320042

data/test/audios/nearend_mic.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc116af609a66f431f94df6b385ff2aa362f8a2d437c2279f5401e47f9178469
size 320042

data/test/audios/speech_with_noise.wav (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9354345a6297f4522e690d337546aa9a686a7e61eefcd935478a2141b924db8f
size 76770

data/test/images/image_salient_detection.jpg (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70ea0c06f9cfe3882253f7175221d47e394ab9c469076ab220e880b17dbcdd02
size 48552

data/test/images/ocr_recognition_document.png (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:29f2ad929c852f6456367054d13e113078cf06b763fe54d73fd324f789331aa3
size 61611

data/test/videos/dog.avi (+3, -0)

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:469090fb217a34a2c096cfd42c251da69dca9fcd1a3c1faae7d29183c1816c14
size 12834294

modelscope/hub/api.py (+4, -2)

@@ -362,8 +362,10 @@ class HubApi:
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
return f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
if file_name.endswith('.csv'):
file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name

def get_dataset_access_config(
self,


modelscope/metainfo.py (+5, -1)

@@ -38,6 +38,7 @@ class Models(object):
# audio models
sambert_hifigan = 'sambert-hifigan'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
generic_asr = 'generic-asr'

@@ -86,6 +87,7 @@ class Pipelines(object):
body_2d_keypoints = 'hrnetv2w32_body-2d-keypoints_image'
human_detection = 'resnet18-human-detection'
object_detection = 'vit-object-detection'
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
live_category = 'live-category'
@@ -109,6 +111,7 @@ class Pipelines(object):
skin_retouching = 'unet-skin-retouching'
tinynas_classification = 'tinynas-classification'
crowd_counting = 'hrnet-crowd-counting'
video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking'

# nlp tasks
sentence_similarity = 'sentence-similarity'
@@ -132,6 +135,7 @@ class Pipelines(object):
sambert_hifigan_tts = 'sambert-hifigan-tts'
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
kws_kwsbp = 'kws-kwsbp'
asr_inference = 'asr-inference'

@@ -215,7 +219,7 @@ class Preprocessors(object):

# multi-modal preprocessor
ofa_tasks_preprocessor = 'ofa-tasks-preprocessor'
mplug_visual_question_answering = 'mplug-visual-question-answering'
mplug_tasks_preprocessor = 'mplug-tasks-preprocessor'


class Metrics(object):


modelscope/models/audio/kws/__init__.py (+2, -0)

@@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .generic_key_word_spotting import GenericKeyWordSpotting
from .farfield.model import FSMNSeleNetV2Decorator

else:
_import_structure = {
'generic_key_word_spotting': ['GenericKeyWordSpotting'],
'farfield.model': ['FSMNSeleNetV2Decorator'],
}

import sys


modelscope/models/audio/kws/farfield/__init__.py (+0, -0)


modelscope/models/audio/kws/farfield/fsmn.py (+495, -0)

@@ -0,0 +1,495 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from .model_def import (HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32,
printNeonMatrix, printNeonVector)

DEBUG = False


def to_kaldi_matrix(np_mat):
""" function that transform as str numpy mat to standard kaldi str matrix

Args:
np_mat: numpy mat

Returns: str
"""
np.set_printoptions(threshold=np.inf, linewidth=np.nan)
out_str = str(np_mat)
out_str = out_str.replace('[', '')
out_str = out_str.replace(']', '')
return '[ %s ]\n' % out_str


def print_tensor(torch_tensor):
""" print torch tensor for debug

Args:
torch_tensor: a tensor
"""
re_str = ''
x = torch_tensor.detach().squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'
print(re_str)


class LinearTransform(nn.Module):

def __init__(self, input_dim, output_dim):
super(LinearTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.linear = nn.Linear(input_dim, output_dim, bias=False)

self.debug = False
self.dataout = None

def forward(self, input):
output = self.linear(input)

if self.debug:
self.dataout = output

return output

def print_model(self):
printNeonMatrix(self.linear.weight)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<LinearTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1\n'

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class AffineTransform(nn.Module):

def __init__(self, input_dim, output_dim):
super(AffineTransform, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim

self.linear = nn.Linear(input_dim, output_dim)

self.debug = False
self.dataout = None

def forward(self, input):
output = self.linear(input)

if self.debug:
self.dataout = output

return output

def print_model(self):
printNeonMatrix(self.linear.weight)
printNeonVector(self.linear.bias)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<AffineTransform> %d %d\n' % (self.output_dim,
self.input_dim)
re_str += '<LearnRateCoef> 1 <BiasLearnRateCoef> 1 <MaxNorm> 0\n'

linear_weights = self.state_dict()['linear.weight']
x = linear_weights.squeeze().numpy()
re_str += to_kaldi_matrix(x)

linear_bias = self.state_dict()['linear.bias']
x = linear_bias.squeeze().numpy()
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class Fsmn(nn.Module):
"""
FSMN implementation.
"""

def __init__(self,
input_dim,
output_dim,
lorder=None,
rorder=None,
lstride=None,
rstride=None):
super(Fsmn, self).__init__()

self.dim = input_dim

if lorder is None:
return

self.lorder = lorder
self.rorder = rorder
self.lstride = lstride
self.rstride = rstride

self.conv_left = nn.Conv2d(
self.dim,
self.dim, (lorder, 1),
dilation=(lstride, 1),
groups=self.dim,
bias=False)

if rorder > 0:
self.conv_right = nn.Conv2d(
self.dim,
self.dim, (rorder, 1),
dilation=(rstride, 1),
groups=self.dim,
bias=False)
else:
self.conv_right = None

self.debug = False
self.dataout = None

def forward(self, input):
x = torch.unsqueeze(input, 1)
x_per = x.permute(0, 3, 2, 1)

y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])

if self.conv_right is not None:
y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride])
y_right = y_right[:, :, self.rstride:, :]
out = x_per + self.conv_left(y_left) + self.conv_right(y_right)
else:
out = x_per + self.conv_left(y_left)

out1 = out.permute(0, 3, 2, 1)
output = out1.squeeze(1)

if self.debug:
self.dataout = output

return output

def print_model(self):
tmpw = self.conv_left.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

if self.conv_right is not None:
tmpw = self.conv_right.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Fsmn> %d %d\n' % (self.dim, self.dim)
re_str += '<LearnRateCoef> %d <LOrder> %d <ROrder> %d <LStride> %d <RStride> %d <MaxNorm> 0\n' % (
1, self.lorder, self.rorder, self.lstride, self.rstride)

lfiters = self.state_dict()['conv_left.weight']
x = np.flipud(lfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)

if self.conv_right is not None:
rfiters = self.state_dict()['conv_right.weight']
x = (rfiters.squeeze().numpy().T)
re_str += to_kaldi_matrix(x)
re_str += '<!EndOfComponent>\n'

return re_str


class RectifiedLinear(nn.Module):

def __init__(self, input_dim, output_dim):
super(RectifiedLinear, self).__init__()
self.dim = input_dim
self.relu = nn.ReLU()

def forward(self, input):
return self.relu(input)

def to_kaldi_nnet(self):
re_str = ''
re_str += '<RectifiedLinear> %d %d\n' % (self.dim, self.dim)
re_str += '<!EndOfComponent>\n'
return re_str


class FSMNNet(nn.Module):
"""
FSMN net for keyword spotting
"""

def __init__(self,
input_dim=200,
linear_dim=128,
proj_dim=128,
lorder=10,
rorder=1,
num_syn=5,
fsmn_layers=4):
"""
Args:
input_dim: input dimension
linear_dim: fsmn input dimension
proj_dim: fsmn projection dimension
lorder: fsmn left order
rorder: fsmn right order
num_syn: output dimension
fsmn_layers: no. of sequential fsmn layers
"""
super(FSMNNet, self).__init__()

self.input_dim = input_dim
self.linear_dim = linear_dim
self.proj_dim = proj_dim
self.lorder = lorder
self.rorder = rorder
self.num_syn = num_syn
self.fsmn_layers = fsmn_layers

self.linear1 = AffineTransform(input_dim, linear_dim)
self.relu = RectifiedLinear(linear_dim, linear_dim)

self.fsmn = self._build_repeats(linear_dim, proj_dim, lorder, rorder,
fsmn_layers)

self.linear2 = AffineTransform(linear_dim, num_syn)

@staticmethod
def _build_repeats(linear_dim=136,
proj_dim=68,
lorder=3,
rorder=2,
fsmn_layers=5):
repeats = [
nn.Sequential(
LinearTransform(linear_dim, proj_dim),
Fsmn(proj_dim, proj_dim, lorder, rorder, 1, 1),
AffineTransform(proj_dim, linear_dim),
RectifiedLinear(linear_dim, linear_dim))
for i in range(fsmn_layers)
]

return nn.Sequential(*repeats)

def forward(self, input):
x1 = self.linear1(input)
x2 = self.relu(x1)
x3 = self.fsmn(x2)
x4 = self.linear2(x3)
return x4

def print_model(self):
self.linear1.print_model()

for layer in self.fsmn:
layer[0].print_model()
layer[1].print_model()
layer[2].print_model()

self.linear2.print_model()

def print_header(self):
#
# write total header
#
header = [0.0] * HEADER_BLOCK_SIZE * 4
# numins
header[0] = 0.0
# numouts
header[1] = 0.0
# dimins
header[2] = self.input_dim
# dimouts
header[3] = self.num_syn
# numlayers
header[4] = 3

#
# write each layer's header
#
hidx = 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.input_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_RELU.value)
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_SEQUENTIAL_FSMN.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.proj_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = self.lorder
header[HEADER_BLOCK_SIZE * hidx + 5] = self.rorder
header[HEADER_BLOCK_SIZE * hidx + 6] = self.fsmn_layers
header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = self.linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = self.num_syn
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_SOFTMAX.value)

for h in header:
print(f32ToI32(h))

def to_kaldi_nnet(self):
re_str = ''
re_str += '<Nnet>\n'
re_str += self.linear1.to_kaldi_nnet()
re_str += self.relu.to_kaldi_nnet()

for fsmn in self.fsmn:
re_str += fsmn[0].to_kaldi_nnet()
re_str += fsmn[1].to_kaldi_nnet()
re_str += fsmn[2].to_kaldi_nnet()
re_str += fsmn[3].to_kaldi_nnet()

re_str += self.linear2.to_kaldi_nnet()
re_str += '<Softmax> %d %d\n' % (self.num_syn, self.num_syn)
re_str += '<!EndOfComponent>\n'
re_str += '</Nnet>\n'

return re_str


class DFSMN(nn.Module):
"""
One deep fsmn layer
"""

def __init__(self,
dimproj=64,
dimlinear=128,
lorder=20,
rorder=1,
lstride=1,
rstride=1):
"""
Args:
dimproj: projection dimension, input and output dimension of memory blocks
dimlinear: dimension of mapping layer
lorder: left order
rorder: right order
lstride: left stride
rstride: right stride
"""
super(DFSMN, self).__init__()

self.lorder = lorder
self.rorder = rorder
self.lstride = lstride
self.rstride = rstride

self.expand = AffineTransform(dimproj, dimlinear)
self.shrink = LinearTransform(dimlinear, dimproj)

self.conv_left = nn.Conv2d(
dimproj,
dimproj, (lorder, 1),
dilation=(lstride, 1),
groups=dimproj,
bias=False)

if rorder > 0:
self.conv_right = nn.Conv2d(
dimproj,
dimproj, (rorder, 1),
dilation=(rstride, 1),
groups=dimproj,
bias=False)
else:
self.conv_right = None

def forward(self, input):
f1 = F.relu(self.expand(input))
p1 = self.shrink(f1)

x = torch.unsqueeze(p1, 1)
x_per = x.permute(0, 3, 2, 1)

y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])

if self.conv_right is not None:
y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride])
y_right = y_right[:, :, self.rstride:, :]
out = x_per + self.conv_left(y_left) + self.conv_right(y_right)
else:
out = x_per + self.conv_left(y_left)

out1 = out.permute(0, 3, 2, 1)
output = input + out1.squeeze(1)

return output

def print_model(self):
self.expand.print_model()
self.shrink.print_model()

tmpw = self.conv_left.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)

if self.conv_right is not None:
tmpw = self.conv_right.weight
tmpwm = torch.zeros(tmpw.shape[2], tmpw.shape[0])
for j in range(tmpw.shape[0]):
tmpwm[:, j] = tmpw[j, 0, :, 0]

printNeonMatrix(tmpwm)


def build_dfsmn_repeats(linear_dim=128,
proj_dim=64,
lorder=20,
rorder=1,
fsmn_layers=6):
"""
build stacked dfsmn layers
Args:
linear_dim:
proj_dim:
lorder:
rorder:
fsmn_layers:

Returns:

"""
repeats = [
nn.Sequential(DFSMN(proj_dim, linear_dim, lorder, rorder, 1, 1))
for i in range(fsmn_layers)
]

return nn.Sequential(*repeats)
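
A minimal usage sketch of the FSMN keyword-spotting net above, assuming the module path introduced by this commit; the shapes follow directly from the constructor arguments:

import torch
from modelscope.models.audio.kws.farfield.fsmn import FSMNNet, build_dfsmn_repeats

net = FSMNNet(input_dim=200, linear_dim=128, proj_dim=128,
              lorder=10, rorder=1, num_syn=5, fsmn_layers=4)
feats = torch.randn(2, 50, 200)        # (batch, frames, feature_dim)
logits = net(feats)                    # (2, 50, 5) frame-level outputs, pre-softmax
dfsmn_stack = build_dfsmn_repeats(128, 64, 20, 1, 6)   # 6 stacked DFSMN layers, 64-dim in/out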

modelscope/models/audio/kws/farfield/fsmn_sele_v2.py (+236, -0)

@@ -0,0 +1,236 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32


class FSMNUnit(nn.Module):
""" A multi-channel fsmn unit

"""

def __init__(self, dimlinear=128, dimproj=64, lorder=20, rorder=1):
"""
Args:
dimlinear: input / output dimension
dimproj: fsmn input / output dimension
lorder: left order
rorder: right order
"""
super(FSMNUnit, self).__init__()

self.shrink = LinearTransform(dimlinear, dimproj)
self.fsmn = Fsmn(dimproj, dimproj, lorder, rorder, 1, 1)
self.expand = AffineTransform(dimproj, dimlinear)

self.debug = False
self.dataout = None

'''
batch, time, channel, feature
'''

def forward(self, input):
if torch.cuda.is_available():
out = torch.zeros(input.shape).cuda()
else:
out = torch.zeros(input.shape)

for n in range(input.shape[2]):
out1 = self.shrink(input[:, :, n, :])
out2 = self.fsmn(out1)
out[:, :, n, :] = F.relu(self.expand(out2))

if self.debug:
self.dataout = out

return out

def print_model(self):
self.shrink.print_model()
self.fsmn.print_model()
self.expand.print_model()

def to_kaldi_nnet(self):
re_str = self.shrink.to_kaldi_nnet()
re_str += self.fsmn.to_kaldi_nnet()
re_str += self.expand.to_kaldi_nnet()

relu = RectifiedLinear(self.expand.linear.out_features,
self.expand.linear.out_features)
re_str += relu.to_kaldi_nnet()

return re_str


class FSMNSeleNetV2(nn.Module):
""" FSMN model with channel selection.
"""

def __init__(self,
input_dim=120,
linear_dim=128,
proj_dim=64,
lorder=20,
rorder=1,
num_syn=5,
fsmn_layers=5,
sele_layer=0):
"""
Args:
input_dim: input dimension
linear_dim: fsmn input dimension
proj_dim: fsmn projection dimension
lorder: fsmn left order
rorder: fsmn right order
num_syn: output dimension
fsmn_layers: no. of fsmn units
sele_layer: channel selection layer index
"""
super(FSMNSeleNetV2, self).__init__()

self.sele_layer = sele_layer

self.featmap = AffineTransform(input_dim, linear_dim)

self.mem = []
for i in range(fsmn_layers):
unit = FSMNUnit(linear_dim, proj_dim, lorder, rorder)
self.mem.append(unit)
self.add_module('mem_{:d}'.format(i), unit)

self.decision = AffineTransform(linear_dim, num_syn)

def forward(self, input):
# multi-channel feature mapping
if torch.cuda.is_available():
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
self.featmap.linear.out_features).cuda()
else:
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
self.featmap.linear.out_features)

for n in range(input.shape[2]):
x[:, :, n, :] = F.relu(self.featmap(input[:, :, n, :]))

for i, unit in enumerate(self.mem):
y = unit(x)

# perform channel selection
if i == self.sele_layer:
pool = nn.MaxPool2d((y.shape[2], 1), stride=(y.shape[2], 1))
y = pool(y)

x = y

# remove channel dimension
y = torch.squeeze(y, -2)
z = self.decision(y)

return z

def print_model(self):
self.featmap.print_model()

for unit in self.mem:
unit.print_model()

self.decision.print_model()

def print_header(self):
'''
get FSMN params
'''
input_dim = self.featmap.linear.in_features
linear_dim = self.featmap.linear.out_features
proj_dim = self.mem[0].shrink.linear.out_features
lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
rorder = 0
if self.mem[0].fsmn.conv_right is not None:
rorder = self.mem[0].fsmn.conv_right.kernel_size[0]

num_syn = self.decision.linear.out_features
fsmn_layers = len(self.mem)

# no. of output channels, 0.0 means the same as numins
# numouts = 0.0
numouts = 1.0

#
# write total header
#
header = [0.0] * HEADER_BLOCK_SIZE * 4
# numins
header[0] = 0.0
# numouts
header[1] = numouts
# dimins
header[2] = input_dim
# dimouts
header[3] = num_syn
# numlayers
header[4] = 3

#
# write each layer's header
#
hidx = 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_RELU.value)
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_SEQUENTIAL_FSMN.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = proj_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = lorder
header[HEADER_BLOCK_SIZE * hidx + 5] = rorder
header[HEADER_BLOCK_SIZE * hidx + 6] = fsmn_layers
if numouts == 1.0:
header[HEADER_BLOCK_SIZE * hidx + 7] = float(self.sele_layer)
else:
header[HEADER_BLOCK_SIZE * hidx + 7] = -1.0
hidx += 1

header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_SOFTMAX.value)

for h in header:
print(f32ToI32(h))

def to_kaldi_nnet(self):
re_str = '<Nnet>\n'

re_str += self.featmap.to_kaldi_nnet()

relu = RectifiedLinear(self.featmap.linear.out_features,
self.featmap.linear.out_features)
re_str += relu.to_kaldi_nnet()

for unit in self.mem:
re_str += unit.to_kaldi_nnet()

re_str += self.decision.to_kaldi_nnet()

re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
self.decision.linear.out_features)
re_str += '<!EndOfComponent>\n'
re_str += '</Nnet>\n'

return re_str
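
A minimal sketch of the multi-channel model above; per the layout comment in FSMNUnit, the expected input is (batch, time, channel, feature), and with sele_layer=0 the channel dimension is max-pooled away after the first FSMN unit:

import torch
from modelscope.models.audio.kws.farfield.fsmn_sele_v2 import FSMNSeleNetV2

net = FSMNSeleNetV2(input_dim=120, linear_dim=128, proj_dim=64,
                    lorder=20, rorder=1, num_syn=5, fsmn_layers=5, sele_layer=0)
x = torch.randn(2, 100, 3, 120)   # (batch, frames, microphone channels, feature_dim)
out = net(x)                      # (2, 100, 5) frame-level outputs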

modelscope/models/audio/kws/farfield/model.py (+74, -0)

@@ -0,0 +1,74 @@
import os
from typing import Dict

import torch

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2


@MODELS.register_module(
Tasks.keyword_spotting, module_name=Models.speech_dfsmn_kws_char_farfield)
class FSMNSeleNetV2Decorator(TorchModel):
r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """

MODEL_TXT = 'model.txt'
SC_CONFIG = 'sound_connect.conf'
SC_CONF_ITEM_KWS_MODEL = '${kws_model}'

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the dfsmn model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
model_bin_file = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self._model = None
if os.path.exists(model_bin_file):
self._model = FSMNSeleNetV2(*args, **kwargs)
checkpoint = torch.load(model_bin_file)
self._model.load_state_dict(checkpoint, strict=False)

self._sc = None
if os.path.exists(model_txt_file):
with open(sc_config_file) as f:
lines = f.readlines()
with open(sc_config_file, 'w') as f:
for line in lines:
if self.SC_CONF_ITEM_KWS_MODEL in line:
line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
model_txt_file)
f.write(line)
import py_sound_connect
self._sc = py_sound_connect.SoundConnect(sc_config_file)
self.size_in = self._sc.bytesPerBlockIn()
self.size_out = self._sc.bytesPerBlockOut()

if self._model is None and self._sc is None:
raise Exception(
f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
...

def forward_decode(self, data: bytes):
result = {'pcm': self._sc.process(data, self.size_out)}
state = self._sc.kwsState()
if state == 2:
result['kws'] = {
'keyword':
self._sc.kwsKeyword(self._sc.kwsSpottedKeywordIndex()),
'offset': self._sc.kwsKeywordOffset(),
'length': self._sc.kwsKeywordLength(),
'confidence': self._sc.kwsConfidence()
}
return result
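
A rough streaming sketch, assuming a model directory that contains sound_connect.conf and model.txt (so the py_sound_connect branch is taken); the directory path and the 44-byte WAV-header skip are illustrative assumptions, not part of the diff:

from modelscope.models.audio.kws.farfield.model import FSMNSeleNetV2Decorator

kws = FSMNSeleNetV2Decorator('/path/to/kws_model_dir')   # hypothetical local directory
with open('data/test/audios/3ch_nihaomiya.wav', 'rb') as f:
    f.read(44)                         # skip a canonical 44-byte WAV header (assumption)
    while True:
        block = f.read(kws.size_in)    # one fixed-size PCM block per call
        if len(block) < kws.size_in:
            break
        result = kws.forward_decode(block)
        if 'kws' in result:            # a keyword was spotted in this block
            print(result['kws'])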

modelscope/models/audio/kws/farfield/model_def.py (+121, -0)

@@ -0,0 +1,121 @@
import math
import struct
from enum import Enum

HEADER_BLOCK_SIZE = 10


class LayerType(Enum):
LAYER_DENSE = 1
LAYER_GRU = 2
LAYER_ATTENTION = 3
LAYER_FSMN = 4
LAYER_SEQUENTIAL_FSMN = 5
LAYER_FSMN_SELE = 6
LAYER_GRU_ATTENTION = 7
LAYER_DFSMN = 8


class ActivationType(Enum):
ACTIVATION_NONE = 0
ACTIVATION_RELU = 1
ACTIVATION_TANH = 2
ACTIVATION_SIGMOID = 3
ACTIVATION_SOFTMAX = 4
ACTIVATION_LOGSOFTMAX = 5


def f32ToI32(f):
"""
Reinterpret the bit pattern of a float32 value as a signed int32.
"""
bs = struct.pack('f', f)

ba = bytearray()
ba.append(bs[0])
ba.append(bs[1])
ba.append(bs[2])
ba.append(bs[3])

return struct.unpack('i', ba)[0]


def printNeonMatrix(w):
"""
print matrix with neon padding
"""
numrows, numcols = w.shape
numnecols = math.ceil(numcols / 4)

for i in range(numrows):
for j in range(numcols):
print(f32ToI32(w[i, j]))

for j in range(numnecols * 4 - numcols):
print(0)


def printNeonVector(b):
"""
print vector with neon padding
"""
size = b.shape[0]
nesize = math.ceil(size / 4)

for i in range(size):
print(f32ToI32(b[i]))

for i in range(nesize * 4 - size):
print(0)


def printDense(layer):
"""
save dense layer
"""
statedict = layer.state_dict()
printNeonMatrix(statedict['weight'])
printNeonVector(statedict['bias'])


def printGRU(layer):
"""
save gru layer
"""
statedict = layer.state_dict()
weight = [statedict['weight_ih_l0'], statedict['weight_hh_l0']]
bias = [statedict['bias_ih_l0'], statedict['bias_hh_l0']]
numins, numouts = weight[0].shape
numins = numins // 3

# output input weights
w_rx = weight[0][:numins, :]
w_zx = weight[0][numins:numins * 2, :]
w_x = weight[0][numins * 2:, :]
printNeonMatrix(w_zx)
printNeonMatrix(w_rx)
printNeonMatrix(w_x)

# output recurrent weights
w_rh = weight[1][:numins, :]
w_zh = weight[1][numins:numins * 2, :]
w_h = weight[1][numins * 2:, :]
printNeonMatrix(w_zh)
printNeonMatrix(w_rh)
printNeonMatrix(w_h)

# output input bias
b_rx = bias[0][:numins]
b_zx = bias[0][numins:numins * 2]
b_x = bias[0][numins * 2:]
printNeonVector(b_zx)
printNeonVector(b_rx)
printNeonVector(b_x)

# output recurrent bias
b_rh = bias[1][:numins]
b_zh = bias[1][numins:numins * 2]
b_h = bias[1][numins * 2:]
printNeonVector(b_zh)
printNeonVector(b_rh)
printNeonVector(b_h)
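
For reference, f32ToI32 reinterprets the IEEE-754 bit pattern of a float32 as a signed int32 (no rounding or truncation), which is what lets the exporters above dump weights as plain integers. A quick check:

f32ToI32(1.0)    # -> 1065353216 == 0x3F800000, the bit pattern of 1.0
f32ToI32(0.0)    # -> 0
f32ToI32(-2.0)   # -> -1073741824, i.e. 0xC0000000 read as a signed int32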

modelscope/models/cv/__init__.py (+2, -1)

@@ -5,4 +5,5 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints,
image_colorization, image_denoise, image_instance_segmentation,
image_portrait_enhancement, image_to_image_generation,
image_to_image_translation, object_detection,
product_retrieval_embedding, super_resolution, virual_tryon)
product_retrieval_embedding, salient_detection,
super_resolution, video_single_object_tracking, virual_tryon)

modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py (+0, -14)

@@ -36,20 +36,8 @@ class NAFNetForImageDenoise(TorchModel):
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = NAFNet(**self.config.model.network_g)
self.loss = PSNRLoss()

if torch.cuda.is_available():
self._device = torch.device('cuda')
else:
self._device = torch.device('cpu')

self.model = self.model.to(self._device)
self.model = self._load_pretrained(self.model, model_path)

if self.training:
self.model.train()
else:
self.model.eval()

def _load_pretrained(self,
net,
load_path,
@@ -109,8 +97,6 @@ class NAFNetForImageDenoise(TorchModel):
Returns:
Dict[str, Tensor]: results
"""
for key, value in inputs.items():
inputs[key] = inputs[key].to(self._device)
if self.training:
return self._train_forward(**inputs)
elif 'target' in inputs:


modelscope/models/cv/image_instance_segmentation/__init__.py (+0, -2)

@@ -7,13 +7,11 @@ if TYPE_CHECKING:
from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin
from .model import CascadeMaskRCNNSwinModel
from .postprocess_utils import get_img_ins_seg_result
from .datasets import ImageInstanceSegmentationCocoDataset
else:
_import_structure = {
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
'model': ['CascadeMaskRCNNSwinModel'],
'postprocess_utils': ['get_img_ins_seg_result'],
'datasets': ['ImageInstanceSegmentationCocoDataset']
}

import sys


modelscope/models/cv/image_instance_segmentation/datasets/__init__.py (+0, -1)

@@ -1,2 +1 @@
from .dataset import ImageInstanceSegmentationCocoDataset
from .transforms import build_preprocess_transform

modelscope/models/cv/object_detection/mmdet_model.py (+2, -2)

@@ -38,7 +38,7 @@ class DetectionModel(TorchModel):
self.model, model_path, map_location='cpu')
self.class_names = checkpoint['meta']['CLASSES']
config.test_pipeline[0].type = 'LoadImageFromWebcam'
self.test_pipeline = Compose(
self.transform_input = Compose(
replace_ImageToTensor(config.test_pipeline))
self.model.cfg = config
self.model.eval()
@@ -56,7 +56,7 @@ class DetectionModel(TorchModel):

from mmcv.parallel import collate, scatter
data = dict(img=image)
data = self.test_pipeline(data)
data = self.transform_input(data)
data = collate([data], samples_per_gpu=1)
data['img_metas'] = [
img_metas.data[0] for img_metas in data['img_metas']


modelscope/models/cv/salient_detection/__init__.py (+22, -0)

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .salient_model import SalientDetection

else:
_import_structure = {
'salient_model': ['SalientDetection'],
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

modelscope/models/cv/salient_detection/models/__init__.py (+1, -0)

@@ -0,0 +1 @@
from .u2net import U2NET

modelscope/models/cv/salient_detection/models/u2net.py (+300, -0)

@@ -0,0 +1,300 @@
# Implementation in this file is modified from source code available via https://github.com/xuebinqin/U-2-Net
import torch
import torch.nn as nn
import torch.nn.functional as F


class REBNCONV(nn.Module):

def __init__(self, in_ch=3, out_ch=3, dirate=1):
super(REBNCONV, self).__init__()
self.conv_s1 = nn.Conv2d(
in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate)
self.bn_s1 = nn.BatchNorm2d(out_ch)
self.relu_s1 = nn.ReLU(inplace=True)

def forward(self, x):
hx = x
xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
return xout


def _upsample_like(src, tar):
"""upsample tensor 'src' to have the same spatial size with tensor 'tar'."""
src = F.upsample(src, size=tar.shape[2:], mode='bilinear')
return src


class RSU7(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU7, self).__init__()
self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx = self.pool4(hx4)
hx5 = self.rebnconv5(hx)
hx = self.pool5(hx5)
hx6 = self.rebnconv6(hx)
hx7 = self.rebnconv7(hx6)
hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
hx6dup = _upsample_like(hx6d, hx5)
hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU6(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU6, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx = self.pool4(hx4)
hx5 = self.rebnconv5(hx)
hx6 = self.rebnconv6(hx5)
hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU5(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU5, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):
hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx = self.pool3(hx3)
hx4 = self.rebnconv4(hx)
hx5 = self.rebnconv5(hx4)
hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU4(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU4, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):

hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx = self.pool1(hx1)
hx2 = self.rebnconv2(hx)
hx = self.pool2(hx2)
hx3 = self.rebnconv3(hx)
hx4 = self.rebnconv4(hx3)
hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
return hx1d + hxin


class RSU4F(nn.Module):

def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
super(RSU4F, self).__init__()

self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)
self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)
self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)

def forward(self, x):

hx = x
hxin = self.rebnconvin(hx)
hx1 = self.rebnconv1(hxin)
hx2 = self.rebnconv2(hx1)
hx3 = self.rebnconv3(hx2)
hx4 = self.rebnconv4(hx3)
hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))
return hx1d + hxin


class U2NET(nn.Module):

def __init__(self, in_ch=3, out_ch=1):
super(U2NET, self).__init__()

# encoder
self.stage1 = RSU7(in_ch, 32, 64)
self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage2 = RSU6(64, 32, 128)
self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage3 = RSU5(128, 64, 256)
self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage4 = RSU4(256, 128, 512)
self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage5 = RSU4F(512, 256, 512)
self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
self.stage6 = RSU4F(512, 256, 512)
# decoder
self.stage5d = RSU4F(1024, 256, 512)
self.stage4d = RSU4(1024, 128, 256)
self.stage3d = RSU5(512, 64, 128)
self.stage2d = RSU6(256, 32, 64)
self.stage1d = RSU7(128, 16, 64)
self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
self.outconv = nn.Conv2d(6 * out_ch, out_ch, 1)

def forward(self, x):

hx = x
hx1 = self.stage1(hx)
hx = self.pool12(hx1)
hx2 = self.stage2(hx)
hx = self.pool23(hx2)
hx3 = self.stage3(hx)
hx = self.pool34(hx3)
hx4 = self.stage4(hx)
hx = self.pool45(hx4)
hx5 = self.stage5(hx)
hx = self.pool56(hx5)
hx6 = self.stage6(hx)
hx6up = _upsample_like(hx6, hx5)

hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
hx5dup = _upsample_like(hx5d, hx4)
hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
hx4dup = _upsample_like(hx4d, hx3)
hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
hx3dup = _upsample_like(hx3d, hx2)
hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
hx2dup = _upsample_like(hx2d, hx1)
hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))
d1 = self.side1(hx1d)
d2 = self.side2(hx2d)
d2 = _upsample_like(d2, d1)
d3 = self.side3(hx3d)
d3 = _upsample_like(d3, d1)
d4 = self.side4(hx4d)
d4 = _upsample_like(d4, d1)
d5 = self.side5(hx5d)
d5 = _upsample_like(d5, d1)
d6 = self.side6(hx6)
d6 = _upsample_like(d6, d1)
d0 = self.outconv(torch.cat((d1, d2, d3, d4, d5, d6), 1))
return torch.sigmoid(d0), torch.sigmoid(d1), torch.sigmoid(
d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(
d5), torch.sigmoid(d6)
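
A minimal sketch of running the network: it returns seven sigmoid maps, the fused prediction d0 followed by the six side outputs, all upsampled to the input resolution:

import torch

net = U2NET(in_ch=3, out_ch=1)
x = torch.randn(1, 3, 320, 320)        # matches the 320x320 resize used by SalientDetection below
d0, d1, d2, d3, d4, d5, d6 = net(x)    # each map is (1, 1, 320, 320)
mask = d0[0, 0]                        # fused saliency prediction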

modelscope/models/cv/salient_detection/salient_model.py (+63, -0)

@@ -0,0 +1,63 @@
import os.path as osp

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from .models import U2NET


@MODELS.register_module(Tasks.image_segmentation, module_name=Models.detection)
class SalientDetection(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""str -- model file root."""
super().__init__(model_dir, *args, **kwargs)
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
self.model = U2NET(3, 1)
checkpoint = torch.load(model_path, map_location='cpu')
self.transform_input = transforms.Compose([
transforms.Resize((320, 320)),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
self.model.load_state_dict(checkpoint)
self.model.eval()

def inference(self, data):
"""data is tensor 3 * H * W ---> return tensor H * W ."""
data = data.unsqueeze(0)
if next(self.model.parameters()).is_cuda:
data = data.to(
torch.device([next(self.model.parameters()).device][0]))

with torch.no_grad():
results = self.model(data)

if next(self.model.parameters()).is_cuda:
return results[0][0, 0, :, :].cpu()
return results[0][0, 0, :, :]

def preprocess(self, image):
"""image is numpy."""
data = self.transform_input(Image.fromarray(image))
return data.float()

def postprocess(self, inputs):
"""resize ."""
data = inputs['data']
w = inputs['img_w']
h = inputs['img_h']
data_norm = (data - torch.min(data)) / (
torch.max(data) - torch.min(data))
data_norm_np = (data_norm.numpy() * 255).astype('uint8')
data_norm_rst = cv2.resize(data_norm_np, (w, h))

return data_norm_rst
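
A minimal end-to-end sketch, assuming a local model directory containing the checkpoint named by ModelFile.TORCH_MODEL_FILE (the directory path is hypothetical):

import cv2
from modelscope.models.cv.salient_detection import SalientDetection

model = SalientDetection('/path/to/salient_model_dir')   # hypothetical local directory
img = cv2.cvtColor(cv2.imread('data/test/images/image_salient_detection.jpg'), cv2.COLOR_BGR2RGB)
data = model.preprocess(img)                             # 3 x 320 x 320 normalized tensor
pred = model.inference(data)                             # 320 x 320 saliency map
mask = model.postprocess({'data': pred, 'img_w': img.shape[1], 'img_h': img.shape[0]})  # uint8, original size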

modelscope/models/cv/video_single_object_tracking/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/config/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/config/ostrack.py (+39, -0)

@@ -0,0 +1,39 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
from easydict import EasyDict as edict

cfg = edict()

# MODEL
cfg.MODEL = edict()

# MODEL.BACKBONE
cfg.MODEL.BACKBONE = edict()
cfg.MODEL.BACKBONE.TYPE = 'vit_base_patch16_224_ce'
cfg.MODEL.BACKBONE.STRIDE = 16
cfg.MODEL.BACKBONE.CAT_MODE = 'direct'
cfg.MODEL.BACKBONE.DROP_PATH_RATE = 0.1
cfg.MODEL.BACKBONE.CE_LOC = [3, 6, 9]
cfg.MODEL.BACKBONE.CE_KEEP_RATIO = [0.7, 0.7, 0.7]
cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE = 'CTR_POINT'

# MODEL.HEAD
cfg.MODEL.HEAD = edict()
cfg.MODEL.HEAD.TYPE = 'CENTER'
cfg.MODEL.HEAD.NUM_CHANNELS = 256

# DATA
cfg.DATA = edict()
cfg.DATA.MEAN = [0.485, 0.456, 0.406]
cfg.DATA.STD = [0.229, 0.224, 0.225]
cfg.DATA.SEARCH = edict()
cfg.DATA.SEARCH.SIZE = 384
cfg.DATA.TEMPLATE = edict()
cfg.DATA.TEMPLATE.SIZE = 192

# TEST
cfg.TEST = edict()
cfg.TEST.TEMPLATE_FACTOR = 2.0
cfg.TEST.TEMPLATE_SIZE = 192
cfg.TEST.SEARCH_FACTOR = 5.0
cfg.TEST.SEARCH_SIZE = 384
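
These values are consumed downstream, e.g. build_box_head in head.py derives the head's feature-map size from the search size and backbone stride; a quick check of the derived numbers:

from modelscope.models.cv.video_single_object_tracking.config.ostrack import cfg

feat_sz = cfg.DATA.SEARCH.SIZE // cfg.MODEL.BACKBONE.STRIDE        # 384 // 16 = 24
template_sz = cfg.DATA.TEMPLATE.SIZE // cfg.MODEL.BACKBONE.STRIDE  # 192 // 16 = 12
print(feat_sz, template_sz)                                        # 24 12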

modelscope/models/cv/video_single_object_tracking/models/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/models/layers/__init__.py (+0, -0)


modelscope/models/cv/video_single_object_tracking/models/layers/attn.py (+54, -0)

@@ -0,0 +1,54 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_drop=0.,
proj_drop=0.,
rpe=False,
z_size=7,
x_size=14):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)

def forward(self, x, mask=None, return_attention=False):
# x: B, N, C
# mask: [B, N, ] torch.bool
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(
0) # make torchscript happy (cannot use tensor as tuple)

attn = (q @ k.transpose(-2, -1)) * self.scale

if mask is not None:
attn = attn.masked_fill(
mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
)

attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)

x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)

if return_attention:
return x, attn
else:
return x
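
A minimal sketch of the attention module above; with return_attention=True it also yields the per-head attention weights that are later reused for candidate elimination:

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 16, 768)                      # (batch, tokens, channels)
out = attn(tokens)                                    # (2, 16, 768)
out, weights = attn(tokens, return_attention=True)    # weights: (2, 12, 16, 16)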

modelscope/models/cv/video_single_object_tracking/models/layers/attn_blocks.py (+129, -0)

@@ -0,0 +1,129 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import math

import torch
import torch.nn as nn
from timm.models.layers import DropPath, Mlp

from .attn import Attention


def candidate_elimination(attn: torch.Tensor, tokens: torch.Tensor,
lens_t: int, keep_ratio: float,
global_index: torch.Tensor,
box_mask_z: torch.Tensor):
"""
Eliminate potential background candidates for computation reduction and noise cancellation.
Args:
attn (torch.Tensor): [B, num_heads, L_t + L_s, L_t + L_s], attention weights
tokens (torch.Tensor): [B, L_t + L_s, C], template and search region tokens
lens_t (int): length of template
keep_ratio (float): keep ratio of search region tokens (candidates)
global_index (torch.Tensor): global index of search region tokens
box_mask_z (torch.Tensor): template mask used to accumulate attention weights

Returns:
tokens_new (torch.Tensor): tokens after candidate elimination
keep_index (torch.Tensor): indices of kept search region tokens
removed_index (torch.Tensor): indices of removed search region tokens
"""
lens_s = attn.shape[-1] - lens_t
bs, hn, _, _ = attn.shape

lens_keep = math.ceil(keep_ratio * lens_s)
if lens_keep == lens_s:
return tokens, global_index, None

attn_t = attn[:, :, :lens_t, lens_t:]

if box_mask_z is not None:
box_mask_z = box_mask_z.unsqueeze(1).unsqueeze(-1).expand(
-1, attn_t.shape[1], -1, attn_t.shape[-1])
attn_t = attn_t[box_mask_z]
attn_t = attn_t.view(bs, hn, -1, lens_s)
attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s
else:
attn_t = attn_t.mean(dim=2).mean(dim=1) # B, H, L-T, L_s --> B, L_s

# use sort instead of topk, due to the speed issue
# https://github.com/pytorch/pytorch/issues/22812
sorted_attn, indices = torch.sort(attn_t, dim=1, descending=True)

_, topk_idx = sorted_attn[:, :lens_keep], indices[:, :lens_keep]
_, non_topk_idx = sorted_attn[:, lens_keep:], indices[:, lens_keep:]
keep_index = global_index.gather(dim=1, index=topk_idx)
removed_index = global_index.gather(dim=1, index=non_topk_idx)

# separate template and search tokens
tokens_t = tokens[:, :lens_t]
tokens_s = tokens[:, lens_t:]

# obtain the attentive and inattentive tokens
B, L, C = tokens_s.shape
attentive_tokens = tokens_s.gather(
dim=1, index=topk_idx.unsqueeze(-1).expand(B, -1, C))

# concatenate these tokens
tokens_new = torch.cat([tokens_t, attentive_tokens], dim=1)

return tokens_new, keep_index, removed_index


class CEBlock(nn.Module):

def __init__(
self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
keep_ratio_search=1.0,
):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)

self.keep_ratio_search = keep_ratio_search

def forward(self,
x,
global_index_template,
global_index_search,
mask=None,
ce_template_mask=None,
keep_ratio_search=None):
x_attn, attn = self.attn(self.norm1(x), mask, True)
x = x + self.drop_path(x_attn)
lens_t = global_index_template.shape[1]

removed_index_search = None
if self.keep_ratio_search < 1 and (keep_ratio_search is None
or keep_ratio_search < 1):
keep_ratio_search = self.keep_ratio_search if keep_ratio_search is None else keep_ratio_search
x, global_index_search, removed_index_search = candidate_elimination(
attn, x, lens_t, keep_ratio_search, global_index_search,
ce_template_mask)

x = x + self.drop_path(self.mlp(self.norm2(x)))
return x, global_index_template, global_index_search, removed_index_search, attn
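
A shape-level sketch of candidate_elimination with illustrative sizes: 64 template tokens, 100 search tokens, and keep_ratio 0.5, so 50 search candidates survive:

import torch

B, heads, lens_t, lens_s, C = 2, 12, 64, 100, 768
attn = torch.rand(B, heads, lens_t + lens_s, lens_t + lens_s)
tokens = torch.randn(B, lens_t + lens_s, C)
global_index = torch.arange(lens_s).unsqueeze(0).repeat(B, 1)   # original positions of the search tokens

new_tokens, kept, removed = candidate_elimination(attn, tokens, lens_t, 0.5, global_index, None)
# new_tokens: (2, 64 + 50, 768); kept / removed hold the surviving / dropped search-token indices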

modelscope/models/cv/video_single_object_tracking/models/layers/head.py (+141, -0)

@@ -0,0 +1,141 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch
import torch.nn as nn


def conv(in_planes,
out_planes,
kernel_size=3,
stride=1,
padding=1,
dilation=1):
return nn.Sequential(
nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=True), nn.BatchNorm2d(out_planes), nn.ReLU(inplace=True))


class CenterPredictor(
nn.Module, ):

def __init__(self, inplanes=64, channel=256, feat_sz=20, stride=16):
super(CenterPredictor, self).__init__()
self.feat_sz = feat_sz
self.stride = stride
self.img_sz = self.feat_sz * self.stride

# corner predict
self.conv1_ctr = conv(inplanes, channel)
self.conv2_ctr = conv(channel, channel // 2)
self.conv3_ctr = conv(channel // 2, channel // 4)
self.conv4_ctr = conv(channel // 4, channel // 8)
self.conv5_ctr = nn.Conv2d(channel // 8, 1, kernel_size=1)

# offset regress
self.conv1_offset = conv(inplanes, channel)
self.conv2_offset = conv(channel, channel // 2)
self.conv3_offset = conv(channel // 2, channel // 4)
self.conv4_offset = conv(channel // 4, channel // 8)
self.conv5_offset = nn.Conv2d(channel // 8, 2, kernel_size=1)

# size regress
self.conv1_size = conv(inplanes, channel)
self.conv2_size = conv(channel, channel // 2)
self.conv3_size = conv(channel // 2, channel // 4)
self.conv4_size = conv(channel // 4, channel // 8)
self.conv5_size = nn.Conv2d(channel // 8, 2, kernel_size=1)

for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)

def forward(self, x, gt_score_map=None):
""" Forward pass with input x. """
score_map_ctr, size_map, offset_map = self.get_score_map(x)

# assert gt_score_map is None
if gt_score_map is None:
bbox = self.cal_bbox(score_map_ctr, size_map, offset_map)
else:
bbox = self.cal_bbox(
gt_score_map.unsqueeze(1), size_map, offset_map)

return score_map_ctr, bbox, size_map, offset_map

def cal_bbox(self,
score_map_ctr,
size_map,
offset_map,
return_score=False):
max_score, idx = torch.max(
score_map_ctr.flatten(1), dim=1, keepdim=True)
idx_y = idx // self.feat_sz
idx_x = idx % self.feat_sz

idx = idx.unsqueeze(1).expand(idx.shape[0], 2, 1)
size = size_map.flatten(2).gather(dim=2, index=idx)
offset = offset_map.flatten(2).gather(dim=2, index=idx).squeeze(-1)

# cx, cy, w, h
bbox = torch.cat(
[(idx_x.to(torch.float) + offset[:, :1]) / self.feat_sz,
(idx_y.to(torch.float) + offset[:, 1:]) / self.feat_sz,
size.squeeze(-1)],
dim=1)

if return_score:
return bbox, max_score
return bbox

def get_score_map(self, x):

def _sigmoid(x):
y = torch.clamp(x.sigmoid_(), min=1e-4, max=1 - 1e-4)
return y

# ctr branch
x_ctr1 = self.conv1_ctr(x)
x_ctr2 = self.conv2_ctr(x_ctr1)
x_ctr3 = self.conv3_ctr(x_ctr2)
x_ctr4 = self.conv4_ctr(x_ctr3)
score_map_ctr = self.conv5_ctr(x_ctr4)

# offset branch
x_offset1 = self.conv1_offset(x)
x_offset2 = self.conv2_offset(x_offset1)
x_offset3 = self.conv3_offset(x_offset2)
x_offset4 = self.conv4_offset(x_offset3)
score_map_offset = self.conv5_offset(x_offset4)

# size branch
x_size1 = self.conv1_size(x)
x_size2 = self.conv2_size(x_size1)
x_size3 = self.conv3_size(x_size2)
x_size4 = self.conv4_size(x_size3)
score_map_size = self.conv5_size(x_size4)
return _sigmoid(score_map_ctr), _sigmoid(
score_map_size), score_map_offset


def build_box_head(cfg, hidden_dim):
stride = cfg.MODEL.BACKBONE.STRIDE

if cfg.MODEL.HEAD.TYPE == 'CENTER':
in_channel = hidden_dim
out_channel = cfg.MODEL.HEAD.NUM_CHANNELS
feat_sz = int(cfg.DATA.SEARCH.SIZE / stride)
center_head = CenterPredictor(
inplanes=in_channel,
channel=out_channel,
feat_sz=feat_sz,
stride=stride)
return center_head
    else:
        raise ValueError('HEAD TYPE %s is not supported.'
                         % cfg.MODEL.HEAD.TYPE)
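
A quick shape check for the CENTER head defined above (sizes are illustrative; feat_sz=20 with stride=16 corresponds to a 320-pixel search region):

import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.head import \
    CenterPredictor

head = CenterPredictor(inplanes=64, channel=256, feat_sz=20, stride=16)
feat = torch.randn(1, 64, 20, 20)        # (B, inplanes, feat_sz, feat_sz)
score_map_ctr, bbox, size_map, offset_map = head(feat)
print(score_map_ctr.shape, bbox.shape)   # torch.Size([1, 1, 20, 20]) torch.Size([1, 4])
print(size_map.shape, offset_map.shape)  # torch.Size([1, 2, 20, 20]) torch.Size([1, 2, 20, 20])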

+ 37
- 0
modelscope/models/cv/video_single_object_tracking/models/layers/patch_embed.py View File

@@ -0,0 +1,37 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn
from timm.models.layers import to_2tuple


class PatchEmbed(nn.Module):
""" 2D Image to Patch Embedding
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten

self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

def forward(self, x):
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
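
For reference, with the defaults above a 224x224 image becomes 14x14 = 196 patch tokens:

import torch

from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
    PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
img = torch.randn(1, 3, 224, 224)
tokens = embed(img)   # BCHW -> BNC
print(tokens.shape)   # torch.Size([1, 196, 768])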

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/__init__.py View File


+ 93
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/base_backbone.py View File

@@ -0,0 +1,93 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch.nn as nn
from timm.models.layers import to_2tuple

from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
PatchEmbed


class BaseBackbone(nn.Module):

def __init__(self):
super().__init__()

# for original ViT
self.pos_embed = None
self.img_size = [224, 224]
self.patch_size = 16
self.embed_dim = 384

self.cat_mode = 'direct'

self.pos_embed_z = None
self.pos_embed_x = None

self.template_segment_pos_embed = None
self.search_segment_pos_embed = None

self.return_stage = [2, 5, 8, 11]

def finetune_track(self, cfg, patch_start_index=1):

search_size = to_2tuple(cfg.DATA.SEARCH.SIZE)
template_size = to_2tuple(cfg.DATA.TEMPLATE.SIZE)
new_patch_size = cfg.MODEL.BACKBONE.STRIDE

self.cat_mode = cfg.MODEL.BACKBONE.CAT_MODE

# resize patch embedding
if new_patch_size != self.patch_size:
print(
'Inconsistent Patch Size With The Pretrained Weights, Interpolate The Weight!'
)
old_patch_embed = {}
for name, param in self.patch_embed.named_parameters():
if 'weight' in name:
param = nn.functional.interpolate(
param,
size=(new_patch_size, new_patch_size),
mode='bicubic',
align_corners=False)
param = nn.Parameter(param)
old_patch_embed[name] = param
self.patch_embed = PatchEmbed(
img_size=self.img_size,
patch_size=new_patch_size,
in_chans=3,
embed_dim=self.embed_dim)
self.patch_embed.proj.bias = old_patch_embed['proj.bias']
self.patch_embed.proj.weight = old_patch_embed['proj.weight']

# for patch embedding
patch_pos_embed = self.pos_embed[:, patch_start_index:, :]
patch_pos_embed = patch_pos_embed.transpose(1, 2)
B, E, Q = patch_pos_embed.shape
        P_H = self.img_size[0] // self.patch_size
        P_W = self.img_size[1] // self.patch_size
patch_pos_embed = patch_pos_embed.view(B, E, P_H, P_W)

# for search region
H, W = search_size
new_P_H, new_P_W = H // new_patch_size, W // new_patch_size
search_patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_P_H, new_P_W),
mode='bicubic',
align_corners=False)
search_patch_pos_embed = search_patch_pos_embed.flatten(2).transpose(
1, 2)

# for template region
H, W = template_size
new_P_H, new_P_W = H // new_patch_size, W // new_patch_size
template_patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_P_H, new_P_W),
mode='bicubic',
align_corners=False)
template_patch_pos_embed = template_patch_pos_embed.flatten(
2).transpose(1, 2)

self.pos_embed_z = nn.Parameter(template_patch_pos_embed)
self.pos_embed_x = nn.Parameter(search_patch_pos_embed)
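
The resizing above boils down to a bicubic interpolation of the patch-position grid; a minimal sketch with assumed sizes (a 14x14 ViT grid interpolated to an 8x8 template grid):

import torch
import torch.nn as nn

pos = torch.randn(1, 196, 768)                  # (1, P_H * P_W, E) patch position embeddings
pos = pos.transpose(1, 2).view(1, 768, 14, 14)  # -> (1, E, P_H, P_W)
pos_z = nn.functional.interpolate(
    pos, size=(8, 8), mode='bicubic', align_corners=False)
pos_z = pos_z.flatten(2).transpose(1, 2)        # -> (1, 64, 768), one row per template patch
print(pos_z.shape)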

+ 109
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/ostrack.py View File

@@ -0,0 +1,109 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch
from torch import nn

from modelscope.models.cv.video_single_object_tracking.models.layers.head import \
build_box_head
from .vit_ce import vit_base_patch16_224_ce


class OSTrack(nn.Module):
""" This is the base class for OSTrack """

def __init__(self,
transformer,
box_head,
aux_loss=False,
head_type='CORNER'):
""" Initializes the model.
Parameters:
transformer: torch module of the transformer architecture.
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
"""
super().__init__()
self.backbone = transformer
self.box_head = box_head

self.aux_loss = aux_loss
self.head_type = head_type
if head_type == 'CORNER' or head_type == 'CENTER':
self.feat_sz_s = int(box_head.feat_sz)
self.feat_len_s = int(box_head.feat_sz**2)

def forward(
self,
template: torch.Tensor,
search: torch.Tensor,
ce_template_mask=None,
ce_keep_rate=None,
):
x, aux_dict = self.backbone(
z=template,
x=search,
ce_template_mask=ce_template_mask,
ce_keep_rate=ce_keep_rate,
)

# Forward head
feat_last = x
if isinstance(x, list):
feat_last = x[-1]
out = self.forward_head(feat_last, None)

out.update(aux_dict)
out['backbone_feat'] = x
return out

def forward_head(self, cat_feature, gt_score_map=None):
"""
        cat_feature: output embeddings of the backbone, which can be (B, HW1+HW2, C) or (B, HW2, C)
        """
        enc_opt = cat_feature[:, -self.feat_len_s:]  # encoder output for the search region (B, HW, C)
opt = (enc_opt.unsqueeze(-1)).permute((0, 3, 2, 1)).contiguous()
bs, Nq, C, HW = opt.size()
opt_feat = opt.view(-1, C, self.feat_sz_s, self.feat_sz_s)

if self.head_type == 'CENTER':
# run the center head
score_map_ctr, bbox, size_map, offset_map = self.box_head(
opt_feat, gt_score_map)
outputs_coord = bbox
outputs_coord_new = outputs_coord.view(bs, Nq, 4)
out = {
'pred_boxes': outputs_coord_new,
'score_map': score_map_ctr,
'size_map': size_map,
'offset_map': offset_map
}
return out
else:
raise NotImplementedError


def build_ostrack(cfg):
if cfg.MODEL.BACKBONE.TYPE == 'vit_base_patch16_224_ce':
backbone = vit_base_patch16_224_ce(
False,
drop_path_rate=cfg.MODEL.BACKBONE.DROP_PATH_RATE,
ce_loc=cfg.MODEL.BACKBONE.CE_LOC,
ce_keep_ratio=cfg.MODEL.BACKBONE.CE_KEEP_RATIO,
)
hidden_dim = backbone.embed_dim
patch_start_index = 1
else:
raise NotImplementedError

backbone.finetune_track(cfg=cfg, patch_start_index=patch_start_index)

box_head = build_box_head(cfg, hidden_dim)

model = OSTrack(
backbone,
box_head,
aux_loss=False,
head_type=cfg.MODEL.HEAD.TYPE,
)

return model
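
A hedged usage sketch of build_ostrack; it assumes the default cfg from config/ostrack.py (a vit_base_patch16_224_ce backbone with a CENTER head) and random inputs:

import torch

from modelscope.models.cv.video_single_object_tracking.config.ostrack import cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \
    build_ostrack

model = build_ostrack(cfg).eval()
template = torch.randn(1, 3, cfg.DATA.TEMPLATE.SIZE, cfg.DATA.TEMPLATE.SIZE)
search = torch.randn(1, 3, cfg.DATA.SEARCH.SIZE, cfg.DATA.SEARCH.SIZE)
with torch.no_grad():
    out = model(template=template, search=search)
print(out['pred_boxes'].shape)   # torch.Size([1, 1, 4]), normalized (cx, cy, w, h)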

+ 24
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/utils.py View File

@@ -0,0 +1,24 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch


def combine_tokens(template_tokens,
search_tokens,
mode='direct',
return_res=False):
if mode == 'direct':
merged_feature = torch.cat((template_tokens, search_tokens), dim=1)
else:
raise NotImplementedError

return merged_feature


def recover_tokens(merged_tokens, mode='direct'):
if mode == 'direct':
recovered_tokens = merged_tokens
else:
raise NotImplementedError

return recovered_tokens

+ 343
- 0
modelscope/models/cv/video_single_object_tracking/models/ostrack/vit_ce.py View File

@@ -0,0 +1,343 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
from functools import partial

import torch
import torch.nn as nn
from timm.models.layers import DropPath, Mlp, to_2tuple

from modelscope.models.cv.video_single_object_tracking.models.layers.attn_blocks import \
CEBlock
from modelscope.models.cv.video_single_object_tracking.models.layers.patch_embed import \
PatchEmbed
from .base_backbone import BaseBackbone
from .utils import combine_tokens, recover_tokens


class Attention(nn.Module):

def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim**-0.5

self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
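        # NOTE: this local Attention (and the Block below) only defines parameters;
        # VisionTransformerCE overrides self.blocks with CEBlock instances, so the
        # attention forward pass actually used comes from layers/attn_blocks.py.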


class Block(nn.Module):

def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)


class VisionTransformer(BaseBackbone):
""" Vision Transformer
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929
Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
- https://arxiv.org/abs/2012.12877
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=True,
distilled=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
embed_layer=PatchEmbed,
norm_layer=None,
act_layer=None):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
in_chans (int): number of input channels
num_classes (int): number of classes for classification head
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
distilled (bool): model includes a distillation token and head as in DeiT models
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
embed_layer (nn.Module): patch embedding layer
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.num_tokens = 2 if distilled else 1
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
act_layer = act_layer or nn.GELU

self.patch_embed = embed_layer(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.dist_token = None
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + self.num_tokens, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.blocks = nn.Sequential(*[
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
act_layer=act_layer) for i in range(depth)
])
self.norm = norm_layer(embed_dim)


class VisionTransformerCE(VisionTransformer):
""" Vision Transformer with candidate elimination (CE) module

A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929

Includes distillation token & head support for `DeiT: Data-efficient Image Transformers`
- https://arxiv.org/abs/2012.12877
"""

def __init__(self,
img_size=224,
patch_size=16,
in_chans=3,
num_classes=1000,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.,
qkv_bias=True,
distilled=False,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
embed_layer=PatchEmbed,
norm_layer=None,
act_layer=None,
ce_loc=None,
ce_keep_ratio=None):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
in_chans (int): number of input channels
num_classes (int): number of classes for classification head
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
qkv_bias (bool): enable bias for qkv if True
distilled (bool): model includes a distillation token and head as in DeiT models
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
embed_layer (nn.Module): patch embedding layer
norm_layer: (nn.Module): normalization layer
"""
super().__init__()
if isinstance(img_size, tuple):
self.img_size = img_size
else:
self.img_size = to_2tuple(img_size)
self.patch_size = patch_size
self.in_chans = in_chans

self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.num_tokens = 2 if distilled else 1
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
act_layer = act_layer or nn.GELU

self.patch_embed = embed_layer(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
num_patches = self.patch_embed.num_patches

self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.dist_token = nn.Parameter(torch.zeros(
1, 1, embed_dim)) if distilled else None
self.pos_embed = nn.Parameter(
torch.zeros(1, num_patches + self.num_tokens, embed_dim))
self.pos_drop = nn.Dropout(p=drop_rate)

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
blocks = []
ce_index = 0
self.ce_loc = ce_loc
for i in range(depth):
ce_keep_ratio_i = 1.0
if ce_loc is not None and i in ce_loc:
ce_keep_ratio_i = ce_keep_ratio[ce_index]
ce_index += 1

blocks.append(
CEBlock(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
act_layer=act_layer,
keep_ratio_search=ce_keep_ratio_i))

self.blocks = nn.Sequential(*blocks)
self.norm = norm_layer(embed_dim)

def forward_features(
self,
z,
x,
mask_x=None,
ce_template_mask=None,
ce_keep_rate=None,
):
B = x.shape[0]

x = self.patch_embed(x)
z = self.patch_embed(z)

z += self.pos_embed_z
x += self.pos_embed_x

x = combine_tokens(z, x, mode=self.cat_mode)

x = self.pos_drop(x)

lens_z = self.pos_embed_z.shape[1]
lens_x = self.pos_embed_x.shape[1]

global_index_t = torch.linspace(0, lens_z - 1, lens_z).to(x.device)
global_index_t = global_index_t.repeat(B, 1)

global_index_s = torch.linspace(0, lens_x - 1, lens_x).to(x.device)
global_index_s = global_index_s.repeat(B, 1)
removed_indexes_s = []
for i, blk in enumerate(self.blocks):
x, global_index_t, global_index_s, removed_index_s, attn = \
blk(x, global_index_t, global_index_s, mask_x, ce_template_mask, ce_keep_rate)

if self.ce_loc is not None and i in self.ce_loc:
removed_indexes_s.append(removed_index_s)

x = self.norm(x)
lens_x_new = global_index_s.shape[1]
lens_z_new = global_index_t.shape[1]

z = x[:, :lens_z_new]
x = x[:, lens_z_new:]

if removed_indexes_s and removed_indexes_s[0] is not None:
removed_indexes_cat = torch.cat(removed_indexes_s, dim=1)

pruned_lens_x = lens_x - lens_x_new
pad_x = torch.zeros([B, pruned_lens_x, x.shape[2]],
device=x.device)
x = torch.cat([x, pad_x], dim=1)
index_all = torch.cat([global_index_s, removed_indexes_cat], dim=1)
# recover original token order
C = x.shape[-1]
x = torch.zeros_like(x).scatter_(
dim=1,
index=index_all.unsqueeze(-1).expand(B, -1, C).to(torch.int64),
src=x)

x = recover_tokens(x, mode=self.cat_mode)

# re-concatenate with the template, which may be further used by other modules
x = torch.cat([z, x], dim=1)

aux_dict = {
'attn': attn,
'removed_indexes_s': removed_indexes_s, # used for visualization
}

return x, aux_dict

def forward(self, z, x, ce_template_mask=None, ce_keep_rate=None):

x, aux_dict = self.forward_features(
z,
x,
ce_template_mask=ce_template_mask,
ce_keep_rate=ce_keep_rate,
)

return x, aux_dict


def _create_vision_transformer(pretrained=False, **kwargs):
model = VisionTransformerCE(**kwargs)
return model


def vit_base_patch16_224_ce(pretrained=False, **kwargs):
""" ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
"""
model_kwargs = dict(
patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
model = _create_vision_transformer(pretrained=pretrained, **model_kwargs)
return model

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/tracker/__init__.py View File


+ 139
- 0
modelscope/models/cv/video_single_object_tracking/tracker/ostrack.py View File

@@ -0,0 +1,139 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import torch

from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
cfg
from modelscope.models.cv.video_single_object_tracking.models.ostrack.ostrack import \
build_ostrack
from modelscope.models.cv.video_single_object_tracking.utils.utils import (
Preprocessor, clip_box, generate_mask_cond, hann2d, sample_target,
transform_image_to_crop)


class OSTrack():

def __init__(self, ckpt_path, device):
network = build_ostrack(cfg)
network.load_state_dict(
torch.load(ckpt_path, map_location='cpu')['net'], strict=True)
self.cfg = cfg
if device.type == 'cuda':
self.network = network.to(device)
else:
self.network = network
self.network.eval()
self.preprocessor = Preprocessor(device)
self.state = None

self.feat_sz = self.cfg.TEST.SEARCH_SIZE // self.cfg.MODEL.BACKBONE.STRIDE
# motion constrain
if device.type == 'cuda':
self.output_window = hann2d(
torch.tensor([self.feat_sz, self.feat_sz]).long(),
centered=True).to(device)
else:
self.output_window = hann2d(
torch.tensor([self.feat_sz, self.feat_sz]).long(),
centered=True)
self.frame_id = 0
# for save boxes from all queries
self.z_dict1 = {}

def initialize(self, image, info: dict):
# forward the template once
z_patch_arr, resize_factor, z_amask_arr = sample_target(
image,
info['init_bbox'],
self.cfg.TEST.TEMPLATE_FACTOR,
output_sz=self.cfg.TEST.TEMPLATE_SIZE)
self.z_patch_arr = z_patch_arr
template = self.preprocessor.process(z_patch_arr, z_amask_arr)
with torch.no_grad():
self.z_dict1 = template

self.box_mask_z = None
if self.cfg.MODEL.BACKBONE.CE_LOC:
template_bbox = self.transform_bbox_to_crop(
info['init_bbox'], resize_factor,
template.tensors.device).squeeze(1)
self.box_mask_z = generate_mask_cond(self.cfg, 1,
template.tensors.device,
template_bbox)

# save states
self.state = info['init_bbox']
self.frame_id = 0

def track(self, image, info: dict = None):
H, W, _ = image.shape
self.frame_id += 1
x_patch_arr, resize_factor, x_amask_arr = sample_target(
image,
self.state,
self.cfg.TEST.SEARCH_FACTOR,
output_sz=self.cfg.TEST.SEARCH_SIZE) # (x1, y1, w, h)
search = self.preprocessor.process(x_patch_arr, x_amask_arr)

with torch.no_grad():
x_dict = search
# merge the template and the search
# run the transformer
out_dict = self.network.forward(
template=self.z_dict1.tensors,
search=x_dict.tensors,
ce_template_mask=self.box_mask_z)

# add hann windows
pred_score_map = out_dict['score_map']
response = self.output_window * pred_score_map
pred_boxes = self.network.box_head.cal_bbox(response,
out_dict['size_map'],
out_dict['offset_map'])
pred_boxes = pred_boxes.view(-1, 4)
# Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.cfg.TEST.SEARCH_SIZE
                    / resize_factor).tolist()  # (cx, cy, w, h) in original-image pixels, relative to the search crop
# get the final box result
self.state = clip_box(
self.map_box_back(pred_box, resize_factor), H, W, margin=10)

x1, y1, w, h = self.state
x2 = x1 + w
y2 = y1 + h
return {'target_bbox': [x1, y1, x2, y2]}

def map_box_back(self, pred_box: list, resize_factor: float):
        cx_prev = self.state[0] + 0.5 * self.state[2]
        cy_prev = self.state[1] + 0.5 * self.state[3]
cx, cy, w, h = pred_box
half_side = 0.5 * self.cfg.TEST.SEARCH_SIZE / resize_factor
cx_real = cx + (cx_prev - half_side)
cy_real = cy + (cy_prev - half_side)
return [cx_real - 0.5 * w, cy_real - 0.5 * h, w, h]

def transform_bbox_to_crop(self,
box_in,
resize_factor,
device,
box_extract=None,
crop_type='template'):
if crop_type == 'template':
crop_sz = torch.Tensor(
[self.cfg.TEST.TEMPLATE_SIZE, self.cfg.TEST.TEMPLATE_SIZE])
elif crop_type == 'search':
crop_sz = torch.Tensor(
[self.cfg.TEST.SEARCH_SIZE, self.cfg.TEST.SEARCH_SIZE])
else:
raise NotImplementedError

box_in = torch.tensor(box_in)
if box_extract is None:
box_extract = box_in
else:
box_extract = torch.tensor(box_extract)
template_bbox = transform_image_to_crop(
box_in, box_extract, resize_factor, crop_sz, normalize=True)
template_bbox = template_bbox.view(1, 1, 4).to(device)

return template_bbox
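
A hedged end-to-end sketch of the tracker API defined above; the checkpoint path is a placeholder and the frames are dummy arrays:

import numpy as np
import torch

frames = [np.zeros((360, 640, 3), dtype=np.uint8) for _ in range(3)]  # stand-in video frames (H, W, 3)
tracker = OSTrack(
    ckpt_path='/path/to/ostrack_checkpoint.pth',  # placeholder path to a trained checkpoint
    device=torch.device('cpu'))
tracker.initialize(frames[0], {'init_bbox': [100, 80, 64, 48]})       # (x1, y1, w, h)
for frame in frames[1:]:
    x1, y1, x2, y2 = tracker.track(frame)['target_bbox']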

+ 0
- 0
modelscope/models/cv/video_single_object_tracking/utils/__init__.py View File


+ 261
- 0
modelscope/models/cv/video_single_object_tracking/utils/utils.py View File

@@ -0,0 +1,261 @@
# The implementation is also open-sourced by the authors as OSTrack, and is available publicly on
# https://github.com/botaoye/OSTrack/
import math
from typing import Optional

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor


def hann1d(sz: int, centered=True) -> torch.Tensor:
"""1D cosine window."""
if centered:
return 0.5 * (1 - torch.cos(
(2 * math.pi / (sz + 1)) * torch.arange(1, sz + 1).float()))
w = 0.5 * (1 + torch.cos(
(2 * math.pi / (sz + 2)) * torch.arange(0, sz // 2 + 1).float()))
return torch.cat([w, w[1:sz - sz // 2].flip((0, ))])


def hann2d(sz: torch.Tensor, centered=True) -> torch.Tensor:
"""2D cosine window."""
return hann1d(sz[0].item(), centered).reshape(1, 1, -1, 1) * hann1d(
sz[1].item(), centered).reshape(1, 1, 1, -1)
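
For example, the motion-constraint window used on a 16x16 score map:

import torch

win = hann2d(torch.tensor([16, 16]).long(), centered=True)
print(win.shape)   # torch.Size([1, 1, 16, 16]); the values peak near the center of the map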


class NestedTensor(object):

def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask


class Preprocessor(object):

    def __init__(self, device: torch.device):
self.device = device
self.mean = torch.tensor([0.485, 0.456, 0.406]).view((1, 3, 1, 1))
self.std = torch.tensor([0.229, 0.224, 0.225]).view((1, 3, 1, 1))
if 'cuda' == self.device.type:
self.mean = self.mean.to(self.device)
self.std = self.std.to(self.device)

def process(self, img_arr: np.ndarray, amask_arr: np.ndarray):
# Deal with the image patch
if 'cuda' == self.device.type:
img_tensor = torch.tensor(img_arr).to(self.device).float().permute(
(2, 0, 1)).unsqueeze(dim=0)
else:
img_tensor = torch.tensor(img_arr).float().permute(
(2, 0, 1)).unsqueeze(dim=0)
img_tensor_norm = (
(img_tensor / 255.0) - self.mean) / self.std # (1,3,H,W)

# Deal with the attention mask
if 'cuda' == self.device.type:
amask_tensor = torch.from_numpy(amask_arr).to(torch.bool).to(
self.device).unsqueeze(dim=0) # (1,H,W)
else:
amask_tensor = torch.from_numpy(amask_arr).to(
torch.bool).unsqueeze(dim=0) # (1,H,W)
return NestedTensor(img_tensor_norm, amask_tensor)


def clip_box(box: list, H, W, margin=0):
x1, y1, w, h = box
x2, y2 = x1 + w, y1 + h
x1 = min(max(0, x1), W - margin)
x2 = min(max(margin, x2), W)
y1 = min(max(0, y1), H - margin)
y2 = min(max(margin, y2), H)
w = max(margin, x2 - x1)
h = max(margin, y2 - y1)
if isinstance(x1, torch.Tensor):
x1 = x1.item()
y1 = y1.item()
w = w.item()
h = h.item()
return [x1, y1, w, h]
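
For instance, a box hanging off the left and bottom edges of a 640x480 frame is pulled back inside:

print(clip_box([-5.0, 10.0, 50.0, 700.0], H=480, W=640, margin=10))   # [0, 10.0, 45.0, 470.0]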


def generate_mask_cond(cfg, bs, device, gt_bbox):
template_size = cfg.DATA.TEMPLATE.SIZE
stride = cfg.MODEL.BACKBONE.STRIDE
template_feat_size = template_size // stride

if cfg.MODEL.BACKBONE.CE_TEMPLATE_RANGE == 'CTR_POINT':
if template_feat_size == 8:
index = slice(3, 4)
elif template_feat_size == 12:
index = slice(5, 6)
elif template_feat_size == 7:
index = slice(3, 4)
elif template_feat_size == 14:
index = slice(6, 7)
else:
raise NotImplementedError
box_mask_z = torch.zeros([bs, template_feat_size, template_feat_size],
device=device)
box_mask_z[:, index, index] = 1
box_mask_z = box_mask_z.flatten(1).to(torch.bool)
else:
raise NotImplementedError

return box_mask_z


def sample_target(im,
target_bb,
search_area_factor,
output_sz=None,
mask=None):
""" Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area

args:
im - cv image
target_bb - target box [x, y, w, h]
search_area_factor - Ratio of crop size to target size
output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done.

returns:
cv image - extracted crop
        float - the factor by which the crop has been resized to make the crop size equal output_sz
        numpy array - attention mask marking the padded area of the crop (1 = padded, 0 = valid)
"""
if not isinstance(target_bb, list):
x, y, w, h = target_bb.tolist()
else:
x, y, w, h = target_bb
# Crop image
crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)

if crop_sz < 1:
raise Exception('Too small bounding box.')

x1 = round(x + 0.5 * w - crop_sz * 0.5)
x2 = x1 + crop_sz

y1 = round(y + 0.5 * h - crop_sz * 0.5)
y2 = y1 + crop_sz

x1_pad = max(0, -x1)
x2_pad = max(x2 - im.shape[1] + 1, 0)

y1_pad = max(0, -y1)
y2_pad = max(y2 - im.shape[0] + 1, 0)

# Crop target
im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]
if mask is not None:
mask_crop = mask[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad]

# Pad
im_crop_padded = cv2.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad,
x2_pad, cv2.BORDER_CONSTANT)
# deal with attention mask
H, W, _ = im_crop_padded.shape
att_mask = np.ones((H, W))
end_x, end_y = -x2_pad, -y2_pad
if y2_pad == 0:
end_y = None
if x2_pad == 0:
end_x = None
att_mask[y1_pad:end_y, x1_pad:end_x] = 0
if mask is not None:
mask_crop_padded = F.pad(
mask_crop,
pad=(x1_pad, x2_pad, y1_pad, y2_pad),
mode='constant',
value=0)

if output_sz is not None:
resize_factor = output_sz / crop_sz
im_crop_padded = cv2.resize(im_crop_padded, (output_sz, output_sz))
att_mask = cv2.resize(att_mask,
(output_sz, output_sz)).astype(np.bool_)
if mask is None:
return im_crop_padded, resize_factor, att_mask
mask_crop_padded = \
F.interpolate(mask_crop_padded[None, None], (output_sz, output_sz),
mode='bilinear', align_corners=False)[0, 0]
return im_crop_padded, resize_factor, att_mask, mask_crop_padded

else:
if mask is None:
            return im_crop_padded, 1.0, att_mask.astype(np.bool_)
return im_crop_padded, 1.0, att_mask.astype(np.bool_), mask_crop_padded
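
A quick numeric check of the crop geometry (dummy image, illustrative box):

import numpy as np

im = np.zeros((480, 640, 3), dtype=np.uint8)
crop, resize_factor, att_mask = sample_target(
    im, [200, 150, 60, 40], search_area_factor=4.0, output_sz=256)
# crop_sz = ceil(sqrt(60 * 40) * 4) = 196, so resize_factor = 256 / 196 ≈ 1.306
print(crop.shape, round(resize_factor, 3), att_mask.shape)   # (256, 256, 3) 1.306 (256, 256)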


def transform_image_to_crop(box_in: torch.Tensor,
box_extract: torch.Tensor,
resize_factor: float,
crop_sz: torch.Tensor,
normalize=False) -> torch.Tensor:
""" Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image
args:
box_in - the box for which the co-ordinates are to be transformed
box_extract - the box about which the image crop has been extracted.
resize_factor - the ratio between the original image scale and the scale of the image crop
crop_sz - size of the cropped image

returns:
torch.Tensor - transformed co-ordinates of box_in
"""
box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]

box_in_center = box_in[0:2] + 0.5 * box_in[2:4]

box_out_center = (crop_sz - 1) / 2 + (box_in_center
- box_extract_center) * resize_factor
box_out_wh = box_in[2:4] * resize_factor

box_out = torch.cat((box_out_center - 0.5 * box_out_wh, box_out_wh))
if normalize:
return box_out / crop_sz[0]
else:
return box_out
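
Worked example: when box_in equals box_extract, the box lands at the center of the crop and is only scaled:

import torch

box = torch.tensor([10., 20., 40., 30.])
out = transform_image_to_crop(
    box, box, resize_factor=2.0, crop_sz=torch.tensor([128., 128.]), normalize=True)
print(out)   # tensor([0.1836, 0.2617, 0.6250, 0.4688]): an 80x60 box centered in the 128x128 crop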


def check_box(box: list, image_height, image_width) -> bool:
""" To check whether the box is within the image range or not
args:
box - the bounding box in the form of [x1, y1, x2, y2]
image_height - the height of the image
image_width - the width of the image

returns:
bool - if box is valid, return True. Otherwise, return False
"""
assert len(box) == 4, 'box must be in the form of: [x1, y1, x2, y2]'
if box[0] < 0 or box[0] >= image_width:
return False
if box[2] < 0 or box[2] >= image_width:
return False
if box[1] < 0 or box[1] >= image_height:
return False
if box[3] < 0 or box[3] >= image_height:
return False
return True


def show_tracking_result(video_in_path, bboxes, video_save_path):
cap = cv2.VideoCapture(video_in_path)
for i in range(len(bboxes)):
box = bboxes[i]
success, frame = cap.read()
        if not success:
            raise Exception(video_in_path
                            + ' cannot be correctly decoded by OpenCV.')
if i == 0:
size = (frame.shape[1], frame.shape[0])
fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
video_writer = cv2.VideoWriter(video_save_path, fourcc,
cap.get(cv2.CAP_PROP_FPS), size,
True)
cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0),
5)
video_writer.write(frame)
    video_writer.release()
cap.release()

+ 4
- 4
modelscope/models/multi_modal/__init__.py View File

@@ -9,9 +9,10 @@ if TYPE_CHECKING:
from .gemm import GEMMForMultiModalEmbedding
from .diffusion import DiffusionForTextToImageSynthesis
from .mmr import VideoCLIPForMultiModalEmbedding
from .mplug_for_visual_question_answering import \
MPlugForVisualQuestionAnswering
from .mplug_for_all_tasks import MPlugForAllTasks
from .ofa_for_all_tasks import OfaForAllTasks
from .ofa_for_text_to_image_synthesis_model import \
OfaForTextToImageSynthesis

else:
_import_structure = {
@@ -19,8 +20,7 @@ else:
'diffusion': ['DiffusionForTextToImageSynthesis'],
'gemm': ['GEMMForMultiModalEmbedding'],
'mmr': ['VideoCLIPForMultiModalEmbedding'],
'mplug_for_visual_question_answering':
['MPlugForVisualQuestionAnswering'],
'mplug_for_all_tasks': ['MPlugForAllTasks'],
'ofa_for_all_tasks': ['OfaForAllTasks'],
'ofa_for_text_to_image_synthesis_model':
['OfaForTextToImageSynthesis']


+ 1
- 1
modelscope/models/multi_modal/clip/__init__.py View File

@@ -1 +1 @@
from .clip_model import CLIPForMultiModalEmbedding
from .model import CLIPForMultiModalEmbedding

+ 422
- 0
modelscope/models/multi_modal/clip/bert_tokenizer.py View File

@@ -0,0 +1,422 @@
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function
import collections
import os
import re
import unicodedata

import six


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12',
'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12'
]

cased_models = [
'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16',
'multi_cased_L-12_H-768_A-12'
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = 'False'
case_name = 'lowercased'
opposite_flag = 'True'

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = 'True'
case_name = 'cased'
opposite_flag = 'False'

if is_bad_config:
raise ValueError(
'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. '
'However, `%s` seems to be a %s model, so you '
'should pass in `--do_lower_case=%s` so that the fine-tuning matches '
            'how the model was pre-trained. If this error is wrong, please '
'just comment out this check.' %
(actual_flag, init_checkpoint, model_name, case_name,
opposite_flag))


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode('utf-8', 'ignore')
elif isinstance(text, unicode):
return text
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode('utf-8', 'ignore')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode('utf-8')
else:
raise ValueError('Unsupported string type: %s' % (type(text)))
else:
raise ValueError('Not running on Python2 or Python 3?')


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, 'r') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)

@staticmethod
def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True):
""" Converts a sequence of tokens (string) in a single string. """

def clean_up_tokenization(out_string):
""" Clean up a list of simple English tokenization artifacts
                like spaces before punctuation and abbreviated forms.
"""
out_string = (
out_string.replace(' .', '.').replace(' ?', '?').replace(
' !', '!').replace(' ,', ',').replace(" ' ", "'").replace(
" n't", "n't").replace(" 'm", "'m").replace(
" 's", "'s").replace(" 've",
"'ve").replace(" 're", "'re"))
return out_string

text = ' '.join(tokens).replace(' ##', '').strip()
if clean_up_tokenization_spaces:
clean_text = clean_up_tokenization(text)
return clean_text
else:
return text

def vocab_size(self):
return len(self.vocab)


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(' '.join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize('NFD', text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == 'Mn':
continue
output.append(char)
return ''.join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return [''.join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(' ')
output.append(char)
output.append(' ')
else:
output.append(char)
return ''.join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(' ')
else:
output.append(char)
return ''.join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = ''.join(chars[start:end])
if start > 0:
substr = '##' + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
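
A toy run of the greedy longest-match-first algorithm with a hand-made vocabulary (the vocab entries are illustrative):

from modelscope.models.multi_modal.clip.bert_tokenizer import WordpieceTokenizer

vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
wp = WordpieceTokenizer(vocab=vocab)
print(wp.tokenize('unaffable'))      # ['un', '##aff', '##able']
print(wp.tokenize('unaffable xyz'))  # ['un', '##aff', '##able', '[UNK]']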


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == ' ' or char == '\t' or char == '\n' or char == '\r':
return True
cat = unicodedata.category(char)
if cat == 'Zs':
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == '\t' or char == '\n' or char == '\r':
return False
cat = unicodedata.category(char)
if cat in ('Cc', 'Cf'):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith('P'):
return True
return False

+ 0
- 29
modelscope/models/multi_modal/clip/clip_bert.py View File

@@ -1,29 +0,0 @@
import torch.nn as nn
from transformers import BertConfig, BertForMaskedLM


class TextTransformer(nn.Module):

def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True):
super(TextTransformer, self).__init__()
bert_config = BertConfig.from_dict(config_dict)
if use_grad_ckp:
bert_config.gradient_checkpointing = True

self.bert = BertForMaskedLM(bert_config).bert

self.projector = nn.Linear(
bert_config.hidden_size, feat_dim, bias=False)

def forward(self, input_ids, attention_mask):
trans_features = {
'input_ids': input_ids,
'attention_mask': attention_mask
}

output_states = self.bert(**trans_features, return_dict=False)
output_tokens = output_states[0]

cls_tokens = output_tokens[:, 0, :]

return self.projector(cls_tokens)

+ 0
- 216
modelscope/models/multi_modal/clip/clip_model.py View File

@@ -1,216 +0,0 @@
from typing import Any, Dict

import cv2
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from tokenizers import BertWordPieceTokenizer
from torch.distributed.nn.functional import \
all_gather as all_gather_with_backprop
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.clip_bert import TextTransformer
from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer
from modelscope.utils.constant import ModeKeys, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class CLIPModel(nn.Module):

def __init__(self, model_dir):
super(CLIPModel, self).__init__()
# including vision config and text config
model_config = json.load(
open('{}/encoder_config.json'.format(model_dir)))

# vision encoder
vision_config = model_config['vision_config']
self.img_size = vision_config['input_resolution']
self.vision_encoder = VisionTransformer(
input_resolution=self.img_size,
patch_size=vision_config['patch_size'],
width=vision_config['width'],
layers=vision_config['layers'],
heads=vision_config['heads'],
output_dim=vision_config['feat_dim'],
use_grad_ckp=True)

# text encoder
text_config = model_config['text_config']
self.text_encoder = TextTransformer(
text_config['bert_config'], feat_dim=text_config['feat_dim'])

self.logit_scale = nn.Parameter(torch.ones([]) * 4.6)

def contrastive_loss(self, logits, dim):
neg_ce = torch.diag(F.log_softmax(logits, dim=dim))
return -neg_ce.mean()

def clip_loss(self, t2i_sim, i2t_sim, img_idx=None, all_img_idx=None):
if img_idx is not None and all_img_idx is not None:
with torch.no_grad():
false_neg_indicator = (
img_idx[:, None] == all_img_idx[None, :])
false_neg_indicator.fill_diagonal_(False)
t2i_sim.masked_fill_(false_neg_indicator, float('-inf'))
i2t_sim.masked_fill_(false_neg_indicator, float('-inf'))
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
else:
caption_loss = self.contrastive_loss(t2i_sim, dim=1)
image_loss = self.contrastive_loss(i2t_sim, dim=1)
return (caption_loss + image_loss) / 2.0

def get_loss(self, img_tensor, text_ids_tensor, text_masks_tensor,
img_id_list):
img_feat = self.forward(img_tensor, input_type='img')
text_feat = self.forward((text_ids_tensor, text_masks_tensor),
input_type='text')

global_img_feat = torch.cat(all_gather_with_backprop(img_feat), dim=0)
global_text_feat = torch.cat(
all_gather_with_backprop(text_feat), dim=0)
global_img_id_list = torch.cat(
all_gather_with_backprop(img_id_list), dim=0)

t2i_sim_mat = text_feat @ global_img_feat.t()
i2t_sim_mat = img_feat @ global_text_feat.t()

logit_scale = self.logit_scale.exp().clamp(max=100.0)
t2i_sim_mat_logits = t2i_sim_mat * logit_scale
i2t_sim_mat_logits = i2t_sim_mat * logit_scale

loss = self.clip_loss(
t2i_sim_mat_logits,
i2t_sim_mat_logits,
img_idx=img_id_list,
all_img_idx=global_img_id_list)

return loss

def forward(self, input_data, input_type):
if input_type == 'img':
img_embedding = self.vision_encoder(input_data)
img_embedding = F.normalize(img_embedding, p=2.0, dim=1)
return img_embedding
elif input_type == 'text':
text_ids_tensor, text_mask_tensor = input_data
text_embedding = self.text_encoder(text_ids_tensor,
text_mask_tensor)
text_embedding = F.normalize(text_embedding, p=2.0, dim=1)
return text_embedding
elif input_type == ModeKeys.TRAIN:
return self.get_loss(*input_data)
else:
raise ValueError('Unknown input type')


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)
self.clip_model = CLIPModel(model_dir=model_dir)
pretrained_params = torch.load(
'{}/pytorch_model.bin'.format(model_dir), 'cpu')
self.clip_model.load_state_dict(pretrained_params)
self.clip_model.eval()

self.device_id = device_id
if self.device_id >= 0:
self.clip_model.to('cuda:{}'.format(self.device_id))
logger.info('Use GPU: {}'.format(self.device_id))
else:
logger.info('Use CPU for inference')

# image preprocessor
norm_op = Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711))
self.img_preprocessor = Compose([
Resize((self.clip_model.img_size, self.clip_model.img_size),
interpolation=Image.BICUBIC),
ToTensor(), norm_op
])

# text tokenizer
vocab_path = '{}/vocab.txt'.format(model_dir)
self.text_tokenizer = BertWordPieceTokenizer(
vocab_path, lowercase=False)
self.text_tokenizer.enable_truncation(max_length=30)

def tokenize_text(self, text_str):
tokens = self.text_tokenizer.encode(text_str)
max_tokens = 30
text_ids_tensor = torch.zeros((1, max_tokens)).long()
text_mask_tensor = torch.zeros((1, max_tokens))

text_ids, text_mask = tokens.ids, tokens.attention_mask
text_ids_tensor[0, 0:len(text_ids)] = torch.tensor(text_ids)
text_mask_tensor[0, 0:len(text_mask)] = torch.tensor(text_mask)

return text_ids_tensor, text_mask_tensor

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
input_img = input['img']
if isinstance(input_img, Image.Image):
img_tensor = self.img_preprocessor(input_img)[None, ...]
elif isinstance(input_img, np.ndarray):
if len(input_img.shape) == 2:
input_img = cv2.cvtColor(input_img, cv2.COLOR_GRAY2BGR)
input_img = input_img[:, :, ::-1] # in rgb order
input_img = Image.fromarray(
input_img.astype('uint8')).convert('RGB')
img_tensor = self.img_preprocessor(input_img)[None, ...]
else:
raise TypeError(
f'img should be either PIL.Image or np.array, but got {type(input_img)}'
)

if self.device_id >= 0:
img_tensor = img_tensor.to('cuda:{}'.format(self.device_id))

img_embedding = self.clip_model(
input_data=img_tensor, input_type='img')
from modelscope.outputs import OutputKeys
output[OutputKeys.IMG_EMBEDDING] = img_embedding.data.cpu().numpy()

if 'text' in input and input['text'] is not None:
text_str = input['text']
if isinstance(text_str, str):
text_ids_tensor, text_mask_tensor = self.tokenize_text(
text_str)
else:
raise TypeError(
f'text should be str, but got {type(text_str)}')

if self.device_id >= 0:
text_ids_tensor = text_ids_tensor.to('cuda:{}'.format(
self.device_id))
text_mask_tensor = text_mask_tensor.to('cuda:{}'.format(
self.device_id))

text_embedding = self.clip_model(
input_data=(text_ids_tensor, text_mask_tensor),
input_type='text')
output['text_embedding'] = text_embedding.data.cpu().numpy()

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+ 0
- 131
modelscope/models/multi_modal/clip/clip_vit.py View File

@@ -1,131 +0,0 @@
# Copyright 2021 The OpenAI CLIP Authors. All rights reserved.

from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)


class VisionTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int, use_grad_ckp: bool):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(
width, layers, heads, use_grad_ckp=use_grad_ckp)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
class_embeddings = self.class_embedding.to(x.dtype) + \
torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
x = torch.cat([class_embeddings, x], dim=1)
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x

+ 82
- 0
modelscope/models/multi_modal/clip/configuration_bert.py View File

@@ -0,0 +1,82 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging

logger = logging.getLogger(__name__)


class BertConfig(object):
r"""
:class:`~transformers.BertConfig` is the configuration class to store the configuration of a
`BertModel`.


Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
        initializer_range: The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""

def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act='gelu',
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
output_attentions=False,
output_hidden_states=False):
self.vocab_size = vocab_size_or_config_json_file
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.output_attentions = output_attentions
self.output_hidden_states = output_hidden_states
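
For reference, a hedged usage sketch of the configuration class added above; the concrete values are illustrative and would normally come from the model's text_model_config.json:

    # Illustrative sketch -- not part of the diff.
    from modelscope.models.multi_modal.clip.configuration_bert import BertConfig

    text_cfg = BertConfig(
        vocab_size_or_config_json_file=21128,  # assumed vocabulary size, for illustration only
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12)
    print(text_cfg.vocab_size, text_cfg.layer_norm_eps)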

+ 677
- 0
modelscope/models/multi_modal/clip/model.py View File

@@ -0,0 +1,677 @@
import os
from collections import OrderedDict
from typing import Any, Dict, Iterable, List, Tuple, Union

import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer
from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
from modelscope.models.multi_modal.clip.modeling_bert import BertModel
from modelscope.utils.constant import ModeKeys, ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['CLIPForMultiModalEmbedding']


class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)

self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)

self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)

self.relu = nn.ReLU(inplace=True)
self.downsample = None
self.stride = stride

if stride > 1 or inplanes != planes * Bottleneck.expansion:
# downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
self.downsample = nn.Sequential(
OrderedDict([('-1', nn.AvgPool2d(stride)),
('0',
nn.Conv2d(
inplanes,
planes * self.expansion,
1,
stride=1,
bias=False)),
('1', nn.BatchNorm2d(planes * self.expansion))]))

def forward(self, x: torch.Tensor):
identity = x

out = self.relu(self.bn1(self.conv1(x)))
out = self.relu(self.bn2(self.conv2(out)))
out = self.avgpool(out)
out = self.bn3(self.conv3(out))

if self.downsample is not None:
identity = self.downsample(x)

out += identity
out = self.relu(out)
return out


class AttentionPool2d(nn.Module):

def __init__(self,
spacial_dim: int,
embed_dim: int,
num_heads: int,
output_dim: int = None):
super().__init__()
self.positional_embedding = nn.Parameter(
torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
self.q_proj = nn.Linear(embed_dim, embed_dim)
self.v_proj = nn.Linear(embed_dim, embed_dim)
self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
self.num_heads = num_heads

def forward(self, x):
x = x.reshape(x.shape[0], x.shape[1],
x.shape[2] * x.shape[3]).permute(2, 0,
1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
x, _ = F.multi_head_attention_forward(
query=x,
key=x,
value=x,
embed_dim_to_check=x.shape[-1],
num_heads=self.num_heads,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
in_proj_weight=None,
in_proj_bias=torch.cat(
[self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
bias_k=None,
bias_v=None,
add_zero_attn=False,
dropout_p=0,
out_proj_weight=self.c_proj.weight,
out_proj_bias=self.c_proj.bias,
use_separate_proj_weight=True,
training=self.training,
need_weights=False)

return x[0]


class ModifiedResNet(nn.Module):
"""
A ResNet class that is similar to torchvision's but contains the following changes:
- There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
- Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
- The final pooling layer is a QKV attention instead of an average pool
"""

def __init__(self,
layers,
output_dim,
heads,
input_resolution=224,
width=64):
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution

# the 3-layer stem
self.conv1 = nn.Conv2d(
3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(width // 2)
self.conv2 = nn.Conv2d(
width // 2, width // 2, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(width // 2)
self.conv3 = nn.Conv2d(
width // 2, width, kernel_size=3, padding=1, bias=False)
self.bn3 = nn.BatchNorm2d(width)
self.avgpool = nn.AvgPool2d(2)
self.relu = nn.ReLU(inplace=True)

# residual layers
self._inplanes = width # this is a *mutable* variable used during construction
self.layer1 = self._make_layer(width, layers[0])
self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

embed_dim = width * 32 # the ResNet feature dimension
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim,
heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
for _ in range(1, blocks):
layers.append(Bottleneck(self._inplanes, planes))

return nn.Sequential(*layers)

def forward(self, x):

def stem(x):
for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2),
(self.conv3, self.bn3)]:
x = self.relu(bn(conv(x)))
x = self.avgpool(x)
return x

x = x.type(self.conv1.weight.dtype)
x = stem(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.attnpool(x)

return x


class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])

def forward(self, x: torch.Tensor):
return self.resblocks(x)


class VisualTransformer(nn.Module):

def __init__(self, input_resolution: int, patch_size: int, width: int,
layers: int, heads: int, output_dim: int):
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)

scale = width**-0.5
self.class_embedding = nn.Parameter(scale * torch.randn(width))
self.positional_embedding = nn.Parameter(scale * torch.randn(
(input_resolution // patch_size)**2 + 1, width))
self.ln_pre = LayerNorm(width)

self.transformer = Transformer(width, layers, heads)

self.ln_post = LayerNorm(width)
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
        class_embeddings = self.class_embedding.to(x.dtype) + \
            torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
        x = torch.cat([class_embeddings, x],
                      dim=1)  # shape = [*, grid ** 2 + 1, width]
x = x + self.positional_embedding.to(x.dtype)
x = self.ln_pre(x)

x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD

x = self.ln_post(x[:, 0, :])

if self.proj is not None:
x = x @ self.proj

return x


class CLIP(nn.Module):

def __init__(
self,
embed_dim: int,
# vision
image_resolution: int,
vision_layers: Union[Tuple[int, int, int, int], int],
vision_width: int,
vision_patch_size: int,
# text
vocab_size: int,
text_attention_probs_dropout_prob: float,
text_hidden_act: str,
text_hidden_dropout_prob: float,
text_hidden_size: int,
text_initializer_range: float,
text_intermediate_size: int,
text_max_position_embeddings: int,
text_num_attention_heads: int,
text_num_hidden_layers: int,
text_type_vocab_size: int,
tokenizer: FullTokenizer,
):
super().__init__()

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
heads=vision_heads,
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
self.visual = VisualTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
width=vision_width,
layers=vision_layers,
heads=vision_heads,
output_dim=embed_dim)

self.bert_config = BertConfig(
vocab_size_or_config_json_file=vocab_size,
hidden_size=text_hidden_size,
num_hidden_layers=text_num_hidden_layers,
num_attention_heads=text_num_attention_heads,
intermediate_size=text_intermediate_size,
hidden_act=text_hidden_act,
hidden_dropout_prob=text_hidden_dropout_prob,
attention_probs_dropout_prob=text_attention_probs_dropout_prob,
max_position_embeddings=text_max_position_embeddings,
type_vocab_size=text_type_vocab_size,
initializer_range=text_initializer_range,
layer_norm_eps=1e-12,
)
self.bert = BertModel(self.bert_config)

self.text_projection = nn.Parameter(
torch.empty(text_hidden_size, embed_dim))
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

self.tokenizer = tokenizer

self.initialize_parameters()

def initialize_parameters(self):
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

if isinstance(self.visual, ModifiedResNet):
if self.visual.attnpool is not None:
std = self.visual.attnpool.c_proj.in_features**-0.5
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

for resnet_block in [
self.visual.layer1, self.visual.layer2, self.visual.layer3,
self.visual.layer4
]:
for name, param in resnet_block.named_parameters():
if name.endswith('bn3.weight'):
nn.init.zeros_(param)

if self.text_projection is not None:
nn.init.normal_(
self.text_projection, std=self.bert_config.hidden_size**-0.5)

@property
def dtype(self):
return self.visual.conv1.weight.dtype

def encode_image(self, image):
return self.visual(image.type(self.dtype))

def encode_text(self, text):
pad_index = self.tokenizer.vocab['[PAD]']
attn_mask = text.ne(pad_index).type(self.dtype)
x = self.bert(
text, attention_mask=attn_mask)[0].type(
self.dtype) # [batch_size, seq_length, hidden_size]
return x[:, 0, :] @ self.text_projection

def forward(self, image, text):
assert image is not None or text is not None, 'text and image cannot both be None!'

if image is None:
return self.encode_text(text)
elif text is None:
return self.encode_image(image)
image_features = self.encode_image(image)
text_features = self.encode_text(text)

image_features = image_features / image_features.norm(
dim=-1, keepdim=True)
text_features = text_features / text_features.norm(
dim=-1, keepdim=True)

return image_features, text_features, self.logit_scale.exp()

def get_similarity(self, image, text):
image_features = self.encode_image(image)
text_features = self.encode_text(text)

# normalized features
image_features = image_features / image_features.norm(
dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

# cosine similarity as logits
logit_scale = self.logit_scale.exp()
logits_per_image = logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()

# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text


def convert_models_to_fp32(model):
for p in model.parameters():
p.data = p.data.float()
        if p.grad is not None:
p.grad.data = p.grad.data.float()


def convert_weights(model: nn.Module):
"""Convert applicable model parameters to fp16"""

def _convert_weights_to_fp16(module):
if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
module.weight.data = module.weight.data.half()
if module.bias is not None:
module.bias.data = module.bias.data.half()

if isinstance(module, nn.MultiheadAttention):
for attr in [
*[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']],
'in_proj_bias', 'bias_k', 'bias_v'
]:
tensor = getattr(module, attr)
if tensor is not None:
tensor.data = tensor.data.half()

if isinstance(module, BertModel):
module.to(torch.half)

for name in ['text_projection', 'proj']:
if hasattr(module, name):
attr = getattr(module, name)
if attr is not None:
attr.data = attr.data.half()

model.apply(_convert_weights_to_fp16)


def _convert_to_rgb(image):
return image.convert('RGB')


def image_transform(image_size=224):
transform = Compose([
_convert_to_rgb,
Resize((image_size, image_size)),
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711)),
])
return transform


@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip)
class CLIPForMultiModalEmbedding(TorchModel):

def __init__(self, model_dir, device_id=-1):
super().__init__(model_dir=model_dir, device_id=device_id)

# Initialize the model.
vision_model_config_file = '{}/vision_model_config.json'.format(
model_dir)
logger.info(
f'Loading vision model config from {vision_model_config_file}')
assert os.path.exists(vision_model_config_file)

text_model_config_file = '{}/text_model_config.json'.format(model_dir)
logger.info(f'Loading text model config from {text_model_config_file}')
assert os.path.exists(text_model_config_file)

with open(vision_model_config_file,
'r') as fv, open(text_model_config_file, 'r') as ft:
model_info = json.load(fv)
for k, v in json.load(ft).items():
model_info[k] = v

# image preprocess
self.img_preprocess = image_transform(model_info['image_resolution'])

# text tokenizer
vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = FullTokenizer(vocab_file=vocab_file)

# initialize the model
self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer)
convert_weights(self.clip_model)

# restore the pretrained weight
checkpoint = torch.load(
f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu')
sd = checkpoint['state_dict']
if next(iter(sd.items()))[0].startswith('module'):
sd = {k[len('module.'):]: v for k, v in sd.items()}
self.clip_model.load_state_dict(sd)
self.clip_model.eval()

# place the model
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
if self.device == 'cuda':
self.clip_model.to(self.device)
logger.info('Use GPU for inference')
else:
self.clip_model.float()
logger.info('Use CPU for inference')

def tokenize(self,
texts: Union[str, List[str]],
context_length: int = 52) -> torch.LongTensor:
"""
Returns the tokenized representation of given input string(s)
Parameters
----------
texts : Union[str, List[str]]
An input string or a list of input strings to tokenize
        context_length : int
            The context length to use (the default here is 52)
Returns
-------
A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
"""
if isinstance(texts, str):
texts = [texts]

all_tokens = []
for text in texts:
all_tokens.append(
[self.tokenizer.vocab['[CLS]']]
+ self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(text))[:context_length - 2]
+ [self.tokenizer.vocab['[SEP]']])

result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

for i, tokens in enumerate(all_tokens):
assert len(tokens) <= context_length
result[i, :len(tokens)] = torch.tensor(tokens)

return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from modelscope.outputs import OutputKeys
output = {
OutputKeys.IMG_EMBEDDING: None,
OutputKeys.TEXT_EMBEDDING: None
}
if 'img' in input and input['img'] is not None:
image_input = input['img']

# single image input
if isinstance(image_input, Image.Image):
image_tensor = self.img_preprocess(image_input).unsqueeze(0)
# multi images input
elif isinstance(image_input, list):
if all([isinstance(elem, Image.Image)
for elem in image_input]):
image_tensor = torch.stack(
[self.img_preprocess(elem) for elem in image_input],
dim=0)
else:
unsupported_elem_type = [
type(elem) for elem in image_input
if not isinstance(elem, Image.Image)
][0]
                    raise TypeError(
                        f'img should be PIL.Image or List[PIL.Image], but got '
                        f'a List containing one {unsupported_elem_type}')
# others
else:
raise TypeError(
f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}'
)

image_tensor = image_tensor.to(self.device)

with torch.no_grad():
image_features = self.clip_model.encode_image(image_tensor)
image_features /= image_features.norm(
dim=-1, keepdim=True) # l2-normalize

output[OutputKeys.IMG_EMBEDDING] = image_features

if 'text' in input and input['text'] is not None:
text_input = input['text']

# single text input
if isinstance(text_input, str):
text_tensor = self.tokenize(text_input)
# multi texts input
elif isinstance(text_input, list):
if all([isinstance(elem, str) for elem in text_input]):
text_tensor = self.tokenize(text_input)
else:
unsupported_elem_type = [
type(elem) for elem in text_input
if not isinstance(elem, str)
][0]
raise TypeError(
f'text should be str or List[str], but got a List containing one {unsupported_elem_type}'
)
# others
else:
raise TypeError(
f'text should be str or List[str], but got {type(text_input)}'
)

text_tensor = text_tensor.to(self.device)

with torch.no_grad():
text_features = self.clip_model.encode_text(text_tensor)
text_features /= text_features.norm(
dim=-1, keepdim=True) # l2-normalize
output[OutputKeys.TEXT_EMBEDDING] = text_features

return output

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

@property
def temperature(self):
return 1.0 / self.clip_model.logit_scale.exp()
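
A hedged end-to-end sketch of how the registered model above is intended to be called; the model directory and image path are placeholders and must point at a checkpoint that ships the vision/text configs, vocab file and weights the constructor expects:

    # Illustrative sketch -- not part of the diff.
    from PIL import Image
    from modelscope.outputs import OutputKeys
    from modelscope.models.multi_modal.clip.model import CLIPForMultiModalEmbedding

    model = CLIPForMultiModalEmbedding(model_dir='/path/to/clip_model_dir')  # placeholder path
    output = model({'img': Image.open('demo.jpg'),                           # placeholder image
                    'text': ['a photo of a cat', 'a photo of a dog']})

    img_emb = output[OutputKeys.IMG_EMBEDDING]   # [1, embed_dim], l2-normalized
    txt_emb = output[OutputKeys.TEXT_EMBEDDING]  # [2, embed_dim], l2-normalized
    scores = img_emb @ txt_emb.t()               # cosine similarity of the image to each text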

+ 507
- 0
modelscope/models/multi_modal/clip/modeling_bert.py View File

@@ -0,0 +1,507 @@
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """

from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import math
import os
import sys
from io import open

import json
import torch
from torch import nn

from .configuration_bert import BertConfig

logger = logging.getLogger(__name__)


def gelu(x):
""" Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
""" Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
Also see https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1 + torch.tanh(
math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {
'gelu': gelu,
'relu': torch.nn.functional.relu,
'swish': swish,
'gelu_new': gelu_new
}

BertLayerNorm = torch.nn.LayerNorm


class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""

def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
config.hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):

def __init__(self, config):
super(BertSelfAttention, self).__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
'The hidden size (%d) is not a multiple of the number of attention '
'heads (%d)' %
(config.hidden_size, config.num_attention_heads))
self.output_attentions = config.output_attentions

self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size
/ config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size, )
context_layer = context_layer.view(*new_context_layer_shape)

outputs = (context_layer,
attention_probs) if self.output_attentions else (
context_layer, )
return outputs


class BertSelfOutput(nn.Module):

def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):

def __init__(self, config):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)
self.pruned_heads = set()

def forward(self, input_tensor, attention_mask=None, head_mask=None):
self_outputs = self.self(input_tensor, attention_mask, head_mask)
attention_output = self.output(self_outputs[0], input_tensor)
outputs = (attention_output,
) + self_outputs[1:] # add attentions if we output them
return outputs


class BertIntermediate(nn.Module):

def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):

def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):

def __init__(self, config):
super(BertLayer, self).__init__()
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)

def forward(self, hidden_states, attention_mask=None, head_mask=None):
attention_outputs = self.attention(hidden_states, attention_mask,
head_mask)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output, ) + attention_outputs[
1:] # add attentions if we output them
return outputs


class BertEncoder(nn.Module):

def __init__(self, config):
super(BertEncoder, self).__init__()
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList(
[BertLayer(config) for _ in range(config.num_hidden_layers)])

def forward(self, hidden_states, attention_mask=None, head_mask=None):
all_hidden_states = ()
all_attentions = ()
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

layer_outputs = layer_module(hidden_states, attention_mask,
head_mask[i])
hidden_states = layer_outputs[0]

if self.output_attentions:
all_attentions = all_attentions + (layer_outputs[1], )

# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states, )

outputs = (hidden_states, )
if self.output_hidden_states:
outputs = outputs + (all_hidden_states, )
if self.output_attentions:
outputs = outputs + (all_attentions, )
return outputs # last-layer hidden state, (all hidden states), (all attentions)


class BertPooler(nn.Module):

def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertPredictionHeadTransform(nn.Module):

def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act,
str) or (sys.version_info[0] == 2
and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = BertLayerNorm(
config.hidden_size, eps=config.layer_norm_eps)

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states


class BertLMPredictionHead(nn.Module):

def __init__(self, config):
super(BertLMPredictionHead, self).__init__()
self.transform = BertPredictionHeadTransform(config)

# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(
config.hidden_size, config.vocab_size, bias=False)

self.bias = nn.Parameter(torch.zeros(config.vocab_size))

def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states) + self.bias
return hidden_states


class BertOnlyMLMHead(nn.Module):

def __init__(self, config):
super(BertOnlyMLMHead, self).__init__()
self.predictions = BertLMPredictionHead(config)

def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores


class BertOnlyNSPHead(nn.Module):

def __init__(self, config):
super(BertOnlyNSPHead, self).__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score


class BertPreTrainingHeads(nn.Module):

def __init__(self, config):
super(BertPreTrainingHeads, self).__init__()
self.predictions = BertLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score


class BertPreTrainedModel(nn.Module):
config_class = BertConfig
base_model_prefix = 'bert'

def __init__(self, config):
super(BertPreTrainedModel, self).__init__()
self.config = config

def _init_weights(self, module):
""" Initialize the weights """
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()


class BertModel(BertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input; you're often better off averaging or pooling
            the sequence of hidden-states for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer)
of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax,
used to compute the weighted average in the self-attention heads.

Examples::

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple

"""

def __init__(self, config):
super(BertModel, self).__init__(config)

self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)

self.apply(self._init_weights)

def forward(self,
input_ids,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(
dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
-1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
-1) # We can specify head_mask for each layer
head_mask = head_mask.to(dtype=next(self.parameters(
                )).dtype)  # switch to float if needed + fp16 compatibility
else:
head_mask = [None] * self.config.num_hidden_layers

embedding_output = self.embeddings(
input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids)
encoder_outputs = self.encoder(
embedding_output, extended_attention_mask, head_mask=head_mask)
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)

outputs = (
sequence_output,
pooled_output,
) + encoder_outputs[
1:] # add hidden_states and attentions if they are here
return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
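
Unlike the Hugging Face original it was copied from, this trimmed-down BertModel has no from_pretrained helper; it is constructed directly from the BertConfig defined alongside it. A minimal sketch with toy sizes and random weights (illustrative only):

    # Illustrative sketch -- not part of the diff.
    import torch
    from modelscope.models.multi_modal.clip.configuration_bert import BertConfig
    from modelscope.models.multi_modal.clip.modeling_bert import BertModel

    config = BertConfig(vocab_size_or_config_json_file=21128)  # assumed vocabulary size
    model = BertModel(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    attention_mask = (input_ids != 0).long()
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)[:2]
    print(sequence_output.shape, pooled_output.shape)  # [2, 16, 768] and [2, 768]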

+ 1
- 1
modelscope/models/multi_modal/diffusion/model.py View File

@@ -136,7 +136,7 @@ class DiffusionForTextToImageSynthesis(Model):
self.unet_upsampler_1024 = diffusion_model.unet_upsampler_1024

# text tokenizer
vocab_path = '{}/vocab.txt'.format(model_dir)
vocab_path = f'{model_dir}/{ModelFile.VOCAB_FILE}'
self.tokenizer = Tokenizer(vocab_file=vocab_path, seq_len=64)

# diffusion process


+ 3
- 3
modelscope/models/multi_modal/gemm/gemm_base.py View File

@@ -491,7 +491,9 @@ class GEVL(nn.Module):
gen_logits = self.to_logits(out_embs[-1:, ...])
probs = F.softmax(self.gen_logit_scale.exp() * gen_logits, dim=-1)
pred = torch.argmax(
probs * (1.0 + torch.rand_like(probs)), axis=-1)
probs * (2.0 + torch.rand_like(probs)), axis=-1)
if int(pred) >= eot_token or int(pred) <= 0:
break
pred_tokens.append(pred)
text_input = torch.cat(
[text_input, pred.permute(1, 0).contiguous()], axis=1)
@@ -500,8 +502,6 @@ class GEVL(nn.Module):
for out_tokens in pred_text_tokens:
tokens = []
for x in out_tokens:
if x >= eot_token or x <= 0:
break
tokens.append(int(x))
out_text = self.tokenizer.decode(tokens)
out_text = out_text.strip()


+ 1
- 2
modelscope/models/multi_modal/mplug/__init__.py View File

@@ -14,5 +14,4 @@
# limitations under the License.

from .configuration_mplug import MPlugConfig
from .modeling_mplug import (CONFIG_NAME, VOCAB_NAME,
MPlugForVisualQuestionAnswering)
from .modeling_mplug import CONFIG_NAME, MPlug

+ 61
- 1
modelscope/models/multi_modal/mplug/clip/clip.py View File

@@ -5,9 +5,69 @@ from typing import Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import nn

from modelscope.models.multi_modal.clip.clip_vit import Transformer

class QuickGELU(nn.Module):

def forward(self, x: torch.Tensor):
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

def __init__(self,
d_model: int,
n_head: int,
attn_mask: torch.Tensor = None):
super().__init__()
self.attn = nn.MultiheadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
('gelu', QuickGELU()),
('c_proj', nn.Linear(d_model * 4, d_model))]))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
self.attn_mask = self.attn_mask.to(
dtype=x.dtype,
device=x.device) if self.attn_mask is not None else None
return self.attn(
x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):

def __init__(self,
width: int,
layers: int,
heads: int,
attn_mask: torch.Tensor = None,
use_grad_ckp: bool = True):
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[
ResidualAttentionBlock(width, heads, attn_mask)
for _ in range(layers)
])
self.use_grad_ckp = use_grad_ckp

def forward(self, x: torch.Tensor):
if self.use_grad_ckp:
for each_block in self.resblocks:
x = checkpoint.checkpoint(each_block, x)
return x
else:
return self.resblocks(x)
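
Both Transformer copies in this patch accept an additive attn_mask; for an autoregressive text tower that argument is typically a causal mask. A hedged sketch of how such a mask is commonly built (the helper name is illustrative, not part of the patch):

    # Illustrative sketch -- not part of the diff.
    import torch

    def build_causal_attn_mask(seq_len: int) -> torch.Tensor:
        # additive mask: 0 on and below the diagonal, -inf above it,
        # so each position attends only to itself and earlier positions
        mask = torch.full((seq_len, seq_len), float('-inf'))
        return torch.triu(mask, diagonal=1)

    print(build_causal_attn_mask(4))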


class Bottleneck(nn.Module):


+ 6
- 23
modelscope/models/multi_modal/mplug/configuration_mplug.py View File

@@ -15,14 +15,14 @@
# limitations under the License.
""" MPLUG model configuration """
import os
from collections import OrderedDict
from typing import Any, Dict, Mapping, Union
from typing import Any, Dict, Union

import yaml
from transformers import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging

from modelscope.utils.constant import Tasks

logger = logging.get_logger(__name__)


@@ -32,6 +32,7 @@ class MPlugConfig(PretrainedConfig):

def __init__(
self,
task=Tasks.visual_question_answering,
bert_config='config_bert.json',
image_res=504,
batch_size_train=128,
@@ -64,7 +65,9 @@ class MPlugConfig(PretrainedConfig):
clip_transformer_heads=12,
clip_transformer_layers=12,
**kwargs):

super().__init__(**kwargs)
self.task = task
self.bert_config = bert_config
self.image_res = image_res
self.batch_size_train = batch_size_train
@@ -103,23 +106,3 @@ class MPlugConfig(PretrainedConfig):
with open(yaml_file, 'r') as reader:
config_dict = yaml.load(reader, Loader=yaml.Loader)
return cls(**config_dict)


class MPlugOnnxConfig(OnnxConfig):

@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
return OrderedDict([
('input_ids', {
0: 'batch',
1: 'sequence'
}),
('attention_mask', {
0: 'batch',
1: 'sequence'
}),
('token_type_ids', {
0: 'batch',
1: 'sequence'
}),
])
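
A hedged sketch of the from_yaml_file entry point kept above; the path is a placeholder for a model directory that ships a config.yaml containing the fields listed in MPlugConfig.__init__:

    # Illustrative sketch -- not part of the diff.
    from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig

    cfg = MPlugConfig.from_yaml_file('/path/to/mplug_model_dir/config.yaml')  # placeholder path
    print(cfg.task, cfg.image_res, cfg.clip_transformer_layers)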

+ 376
- 144
modelscope/models/multi_modal/mplug/modeling_mplug.py View File

@@ -42,14 +42,13 @@ from transformers.utils import logging

from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig
from modelscope.models.multi_modal.mplug.predictor import TextGenerator
from modelscope.utils.constant import ModelFile

transformers.logging.set_verbosity_error()

logger = logging.get_logger(__name__)

CONFIG_NAME = 'config.yaml'
WEIGHTS_NAME = 'pytorch_model.bin'
VOCAB_NAME = 'vocab.txt'

_CONFIG_FOR_DOC = 'BertConfig'
_TOKENIZER_FOR_DOC = 'BertTokenizer'
@@ -1726,32 +1725,145 @@ class BertLMHeadModel(BertPreTrainedModel):
return reordered_past


class MPlugForVisualQuestionAnswering(PreTrainedModel):
class BertPrefixModel(BertPreTrainedModel):

_keys_to_ignore_on_load_unexpected = [r'pooler']
_keys_to_ignore_on_load_missing = [
r'position_ids', r'predictions.decoder.bias'
]

def __init__(self, config):
super().__init__(config)

self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)

self.init_weights()

def get_output_embeddings(self):
return self.cls.predictions.decoder

def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings

@add_start_docstrings_to_model_forward(
BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint='bert-base-uncased',
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
is_decoder=True,
reduction='mean',
soft_labels=None,
alpha=0,
return_logits=False,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False

outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
is_decoder=is_decoder,
)

sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)

if return_logits:
return prediction_scores[:, :-1, :].contiguous()

lm_loss = None
if labels is not None:
# we are doing next-token prediction; shift prediction scores and input ids by one
shifted_prediction_scores = prediction_scores[:, :
-1, :].contiguous()
labels = labels[:, 1:].contiguous()
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(
shifted_prediction_scores.view(-1, self.config.vocab_size),
labels.view(-1))
if soft_labels is not None:
loss_distill = -torch.sum(
F.log_softmax(shifted_prediction_scores, dim=1) * soft_labels,
dim=-1)
loss_distill = loss_distill[labels != -100].mean()
lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill

if not return_dict:
output = (prediction_scores, ) + outputs[2:]
return ((lm_loss, ) + output) if lm_loss is not None else output

return CausalLMOutputWithCrossAttentions(
loss=lm_loss,
logits=prediction_scores,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)


class MPlug(PreTrainedModel):
config_class = MPlugConfig

def __init__(self, config):
super().__init__(config)
self.config = config
self.tokenizer = BertTokenizer.from_pretrained(
os.path.join(config.model_dir, VOCAB_NAME))
os.path.join(config.model_dir, ModelFile.VOCAB_FILE))
self.module_setting(config)
self.visual_encoder = self._initialize_clip(config)
self.text_encoder = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder = BertLMHeadModel(self.config_decoder)
self.init_distill(config)
self.beam_generator = TextGenerator(config, self.text_decoder)

@classmethod
def from_pretrained(cls, model_dir, load_checkpoint=True):
config = MPlugConfig.from_yaml_file(
from modelscope.utils.constant import Tasks

task_mapping = {
Tasks.visual_question_answering: MPlugForVisualQuestionAnswering,
Tasks.image_captioning: MPLUGForImageCaption
}
config = cls.config_class.from_yaml_file(
os.path.join(model_dir, CONFIG_NAME))
config.model_dir = model_dir
model = cls(config)
model = task_mapping[config.task](config)
if load_checkpoint:
checkpoint_path = os.path.join(model_dir, WEIGHTS_NAME)
checkpoint_path = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
checkpoint = torch.load(checkpoint_path, map_location='cpu')
if 'model' in checkpoint:
state_dict = checkpoint['model']
@@ -1803,6 +1915,161 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel):
clip_model.visual.positional_embedding = pos_embed
return clip_model

def forward(self, *args, **kwargs):
raise NotImplementedError

def module_setting(self, config):
bert_config_path = os.path.join(config.model_dir, config.bert_config)
self.config_encoder = BertConfig.from_json_file(bert_config_path)
self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers
self.config_fusion = BertConfig.from_json_file(bert_config_path)
self.config_decoder = BertConfig.from_json_file(bert_config_path)
self.config_decoder.add_cross_attention = True
self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers
self.large = False
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob)
self.large = True

@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient

@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (
1. - self.momentum)

def generation(self, question_states, question_atts, out_size=1):
encoder_inputs = [question_states, question_atts]
topk_ids, topk_scores = self.beam_generator.translate_batch(
encoder_inputs, out_size=out_size)
return topk_ids, topk_scores

@staticmethod
def _tile(x, dim, n_tile):
import numpy as np
init_dim = x.size(dim)
repeat_idx = [1] * x.dim()
repeat_idx[dim] = n_tile
x = x.repeat(*(repeat_idx))
order_index = torch.LongTensor(
np.concatenate(
[init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
return torch.index_select(x, dim, order_index.to(x.device))

def rank_answer(self, question_states, question_atts, answer_ids,
answer_atts, k):

num_ques = question_states.size(0)
start_ids = answer_ids[0, 0].repeat(num_ques, 1) # bos token

start_output = self.text_decoder(
start_ids,
encoder_hidden_states=question_states,
encoder_attention_mask=question_atts,
return_dict=True,
reduction='none')
logits = start_output.logits[:, 0, :] # first token's logit

# topk_probs: top-k probability
# topk_ids: [num_question, k]
answer_first_token = answer_ids[:, 1]
prob_first_token = F.softmax(
logits, dim=1).index_select(
dim=1, index=answer_first_token)
topk_probs, topk_ids = prob_first_token.topk(k, dim=1)

# answer input: [num_question*k, answer_len]
input_ids = []
input_atts = []
for b, topk_id in enumerate(topk_ids):
input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
input_ids = torch.cat(input_ids, dim=0)
input_atts = torch.cat(input_atts, dim=0)

targets_ids = input_ids.masked_fill(
input_ids == self.tokenizer.pad_token_id, -100)

# repeat encoder's output for top-k answers
question_states = self._tile(question_states, 0, k)
question_atts = self._tile(question_atts, 0, k)

output = self.text_decoder(
input_ids,
attention_mask=input_atts,
encoder_hidden_states=question_states,
encoder_attention_mask=question_atts,
labels=targets_ids,
return_dict=True,
reduction='none')

answer_loss = output.loss
answer_loss = answer_loss.view(input_ids.size(0), -1)

# topk_prob: first token probability
topk_probs = topk_probs.view(-1, 1)
log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)

# re-calculate log probabilities for the answer sequences using chain rule
log_probs_sum = log_probs.sum(1)
log_probs_sum = log_probs_sum.view(num_ques, k)

topk_probs = F.softmax(log_probs_sum, dim=-1)
# get top-k after re-ranking
topk_probs, rerank_id = topk_probs.topk(k, dim=1)
topk_ids = torch.gather(topk_ids, 1, rerank_id)

return topk_ids, topk_probs


class MPlugForVisualQuestionAnswering(MPlug):

def __init__(self, config):
super().__init__(config)
self.text_decoder = BertLMHeadModel(self.config_decoder)
self.beam_generator = TextGenerator(config, self.text_decoder)
self.init_distill(config)

def init_distill(self, config):
self.distill = config.distill
if self.distill:
self.visual_encoder_m = self._initialize_clip(config)
self.text_encoder_m = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder_m = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder_m = BertLMHeadModel(self.config_decoder)
self.model_pairs = [
[self.visual_encoder, self.visual_encoder_m],
[self.text_encoder, self.text_encoder_m],
[self.text_decoder, self.text_decoder_m],
]
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc_m = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm_m = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout_m = nn.Dropout(
self.config_encoder.hidden_dropout_prob)
self.model_pairs.extend(
[[self.visn_fc, self.visn_fc_m],
[self.visn_layer_norm, self.visn_layer_norm_m]])
self.copy_params()
self.momentum = 0.995

def forward(self,
image,
question,
@@ -1935,145 +2202,110 @@ class MPlugForVisualQuestionAnswering(PreTrainedModel):
merge_text_attention)
return topk_ids, topk_probs

def module_setting(self, config):
bert_config_path = os.path.join(config.model_dir, config.bert_config)
self.config_encoder = BertConfig.from_json_file(bert_config_path)
self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers
self.config_fusion = BertConfig.from_json_file(bert_config_path)
self.config_decoder = BertConfig.from_json_file(bert_config_path)
self.config_decoder.add_cross_attention = True
self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers
self.large = False
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout = nn.Dropout(self.config_encoder.hidden_dropout_prob)
self.large = True

def init_distill(self, config):
self.distill = config.distill
if self.distill:
self.visual_encoder_m = self._initialize_clip(config)
self.text_encoder_m = BertModel(
self.config_encoder, add_pooling_layer=False)
self.fusion_encoder_m = FusionModel(
self.config_fusion, add_pooling_layer=False)
self.text_decoder_m = BertLMHeadModel(self.config_decoder)
self.model_pairs = [
[self.visual_encoder, self.visual_encoder_m],
[self.text_encoder, self.text_encoder_m],
[self.text_decoder, self.text_decoder_m],
]
if self.config_encoder.hidden_size != config.vision_width:
self.visn_fc_m = nn.Linear(config.vision_width,
self.config_encoder.hidden_size)
self.visn_layer_norm_m = nn.LayerNorm(
self.config_encoder.hidden_size, eps=1e-12)
self.dropout_m = nn.Dropout(
self.config_encoder.hidden_dropout_prob)
self.model_pairs.extend(
[[self.visn_fc, self.visn_fc_m],
[self.visn_layer_norm, self.visn_layer_norm_m]])
self.copy_params()
self.momentum = 0.995

@torch.no_grad()
def copy_params(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data.copy_(param.data) # initialize
param_m.requires_grad = False # not update by gradient

@torch.no_grad()
def _momentum_update(self):
for model_pair in self.model_pairs:
for param, param_m in zip(model_pair[0].parameters(),
model_pair[1].parameters()):
param_m.data = param_m.data * self.momentum + param.data * (
1. - self.momentum)

def generation(self, question_states, question_atts):
encoder_inputs = [question_states, question_atts]
topk_ids, topk_scores = self.beam_generator.translate_batch(
encoder_inputs)
return topk_ids, topk_scores

@staticmethod
def _tile(x, dim, n_tile):
import numpy as np
init_dim = x.size(dim)
repeat_idx = [1] * x.dim()
repeat_idx[dim] = n_tile
x = x.repeat(*(repeat_idx))
order_index = torch.LongTensor(
np.concatenate(
[init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
return torch.index_select(x, dim, order_index.to(x.device))

    def rank_answer(self, question_states, question_atts, answer_ids,
                    answer_atts, k):

        num_ques = question_states.size(0)
        start_ids = answer_ids[0, 0].repeat(num_ques, 1)  # bos token

        start_output = self.text_decoder(
            start_ids,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            return_dict=True,
            reduction='none')
        logits = start_output.logits[:, 0, :]  # first token's logit

        # topk_probs: top-k probability
        # topk_ids: [num_question, k]
        answer_first_token = answer_ids[:, 1]
        prob_first_token = F.softmax(
            logits, dim=1).index_select(
                dim=1, index=answer_first_token)
        topk_probs, topk_ids = prob_first_token.topk(k, dim=1)

        # answer input: [num_question*k, answer_len]
        input_ids = []
        input_atts = []
        for b, topk_id in enumerate(topk_ids):
            input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
        input_ids = torch.cat(input_ids, dim=0)
        input_atts = torch.cat(input_atts, dim=0)

        targets_ids = input_ids.masked_fill(
            input_ids == self.tokenizer.pad_token_id, -100)

        # repeat encoder's output for top-k answers
        question_states = self._tile(question_states, 0, k)
        question_atts = self._tile(question_atts, 0, k)

        output = self.text_decoder(
            input_ids,
            attention_mask=input_atts,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
            labels=targets_ids,
            return_dict=True,
            reduction='none')

        answer_loss = output.loss
        answer_loss = answer_loss.view(input_ids.size(0), -1)

        # topk_prob: first token probability
        topk_probs = topk_probs.view(-1, 1)
        log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)

        # re-calculate log probabilities for the answer sequences using chain rule
        log_probs_sum = log_probs.sum(1)
        log_probs_sum = log_probs_sum.view(num_ques, k)

        topk_probs = F.softmax(log_probs_sum, dim=-1)
        # get top-k after re-ranking
        topk_probs, rerank_id = topk_probs.topk(k, dim=1)
        topk_ids = torch.gather(topk_ids, 1, rerank_id)

        return topk_ids, topk_probs


class MPLUGForImageCaption(MPlug):

    def __init__(self, config):
        super().__init__(config)
        self.text_decoder = BertPrefixModel(self.config_decoder)
        self.beam_generator = TextGenerator(config, self.text_decoder)

    def beam_search(self,
                    image,
                    question,
                    answer=None,
                    train=True,
                    out_size=5):
        image_embeds = self.visual_encoder.visual(image, skip_last_layer=True)
        if self.large:
            image_embeds = self.dropout(
                self.visn_layer_norm(self.visn_fc(image_embeds)))
        image_atts = torch.ones(
            image_embeds.size()[:-1], dtype=torch.long).to(image.device)
        text_output = self.text_encoder(
            question.input_ids,
            attention_mask=question.attention_mask,
            return_dict=True)
        text_embeds = text_output.last_hidden_state
        fusion_output = self.fusion_encoder(
            encoder_embeds=text_embeds,
            attention_mask=question.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=False)
        image_output, question_output = fusion_output
        question_output = torch.cat([image_output, question_output], 1)
        merge_text_attention = torch.cat(
            [image_atts, question.attention_mask], 1)
        topk_ids, topk_probs = self.generation(
            question_output, merge_text_attention, out_size=out_size)
        return topk_ids, topk_probs

    def forward(self,
                image,
                question,
                answer=None,
                train=True,
                out_size=5,
                scst=False):
        if scst:
            return self.beam_search(
                image, question, answer, train=True, out_size=out_size)
        image = image.to(dtype=next(self.parameters()).dtype)
        image_embeds = self.visual_encoder.visual(image, skip_last_layer=True)
        if self.large:
            image_embeds = self.dropout(
                self.visn_layer_norm(self.visn_fc(image_embeds)))
        image_atts = torch.ones(
            image_embeds.size()[:-1], dtype=torch.long).to(image.device)

        if train:
            answer_targets = answer.input_ids.masked_fill(
                answer.input_ids == self.tokenizer.pad_token_id, -100)
            text_output = self.text_encoder(
                question.input_ids,
                attention_mask=question.attention_mask,
                return_dict=True)
            text_embeds = text_output.last_hidden_state
            fusion_output = self.fusion_encoder(
                encoder_embeds=text_embeds,
                attention_mask=question.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=False)
            image_output, question_output = fusion_output
            question_output = torch.cat([image_output, question_output], 1)
            merge_text_attention = torch.cat(
                [image_atts, question.attention_mask], 1)
            answer_output = self.text_decoder(
                answer.input_ids,
                attention_mask=answer.attention_mask,
                encoder_hidden_states=question_output,
                encoder_attention_mask=merge_text_attention,
                labels=answer_targets,
                return_dict=True,
                reduction='none')
            loss = answer_output.loss
            return loss
        else:
            text_output = self.text_encoder(
                question.input_ids,
                attention_mask=question.attention_mask,
                return_dict=True)
            text_embeds = text_output.last_hidden_state
            fusion_output = self.fusion_encoder(
                encoder_embeds=text_embeds,
                attention_mask=question.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=False)
            image_output, question_output = fusion_output
            question_output = torch.cat([image_output, question_output], 1)
            merge_text_attention = torch.cat(
                [image_atts, question.attention_mask], 1)
            topk_ids, topk_probs = self.generation(question_output,
                                                   merge_text_attention)
            return topk_ids, topk_probs
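A note on the re-ranking step above (an editorial sketch, not part of the diff): rank_answer keeps the k candidate answers with the highest first-token probability and then re-scores each candidate as

    \mathrm{score}(a) \;=\; \log p_{\text{first}}(a_1) \;-\; \sum_{t} \mathrm{CE}_t(a)

which approximates the chain-rule sequence log-likelihood \log P(a \mid q) = \sum_t \log P(a_t \mid q, a_{<t}), because the per-token cross-entropy returned with reduction='none' is the negative log-probability of each answer token. The final softmax over log_probs_sum converts these scores back into normalized top-k probabilities.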

modelscope/models/multi_modal/mplug_for_visual_question_answering.py → modelscope/models/multi_modal/mplug_for_all_tasks.py View File

@@ -6,12 +6,13 @@ from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks

__all__ = ['MPlugForVisualQuestionAnswering']
__all__ = ['MPlugForAllTasks']


@MODELS.register_module(
Tasks.visual_question_answering, module_name=Models.mplug)
class MPlugForVisualQuestionAnswering(TorchModel):
@MODELS.register_module(Tasks.image_captioning, module_name=Models.mplug)
class MPlugForAllTasks(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the mplug model from the `model_dir` path.
@@ -20,8 +21,8 @@ class MPlugForVisualQuestionAnswering(TorchModel):
"""

super().__init__(model_dir, *args, **kwargs)
from modelscope.models.multi_modal.mplug import MPlugForVisualQuestionAnswering
self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir)
from modelscope.models.multi_modal.mplug import MPlug
self.model = MPlug.from_pretrained(model_dir)
self.tokenizer = self.model.tokenizer

def train(self):
@@ -44,4 +45,13 @@ class MPlugForVisualQuestionAnswering(TorchModel):
}
"""

return self.model(**input)[0]
topk_ids, _ = self.model(**input)
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))

pred_string = self.tokenizer.decode(topk_ids[0][0])
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string = pred_string.strip()
return pred_string

+ 3
- 1
modelscope/models/multi_modal/ofa/tokenization_ofa.py View File

@@ -22,6 +22,8 @@ from transformers.models.bert.tokenization_bert import (BasicTokenizer,
WordpieceTokenizer)
from transformers.utils import logging

from modelscope.utils.constant import ModelFile

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.json', 'merges_file': 'merges.txt'}
@@ -42,7 +44,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}

VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP_ZH = {
'vocab_file': {


+ 2
- 1
modelscope/models/multi_modal/ofa/tokenization_ofa_fast.py View File

@@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerFast
from transformers.models.bart.tokenization_bart_fast import BartTokenizerFast
from transformers.utils import logging

from modelscope.utils.constant import ModelFile
from .tokenization_ofa import OFATokenizer, OFATokenizerZH

logger = logging.get_logger(__name__)
@@ -50,7 +51,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ofa-base': 1024,
}

VOCAB_FILES_NAMES_ZH = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES_ZH = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP_ZH = {
'vocab_file': {


+ 2
- 1
modelscope/models/nlp/structbert/tokenization_sbert.py View File

@@ -23,11 +23,12 @@ from typing import List, Optional, Tuple
from transformers.tokenization_utils import (PreTrainedTokenizer, _is_control,
_is_punctuation, _is_whitespace)

from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {'vocab_file': ModelFile.VOCAB_FILE}

PRETRAINED_VOCAB_FILES_MAP = {'vocab_file': {}}



+ 2
- 1
modelscope/models/nlp/structbert/tokenization_sbert_fast.py View File

@@ -22,13 +22,14 @@ import transformers
from tokenizers import normalizers
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from .tokenization_sbert import SbertTokenizer

logger = get_logger(__name__)

VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.txt',
'vocab_file': ModelFile.VOCAB_FILE,
'tokenizer_file': 'tokenizer.json'
}



+ 54
- 32
modelscope/msdatasets/ms_dataset.py View File

@@ -13,9 +13,12 @@ from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path)

from modelscope.msdatasets.config import MS_DATASETS_CACHE
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DatasetFormations, DownloadMode, Hubs)
from modelscope.utils.logger import get_logger
from .task_datasets.builder import build_task_dataset
from .utils.dataset_builder import ExternalDataset
from .utils.dataset_utils import (get_dataset_files,
get_target_dataset_structure,
load_dataset_builder)
@@ -67,9 +70,16 @@ class MsDataset:
def __len__(self):
return len(self._hf_ds)

@property
def config_kwargs(self):
if isinstance(self._hf_ds, ExternalDataset):
return self._hf_ds.config_kwargs
else:
return None

@classmethod
def from_hf_dataset(cls,
hf_ds: Union[Dataset, DatasetDict],
hf_ds: Union[Dataset, DatasetDict, ExternalDataset],
target: str = None) -> Union[dict, 'MsDataset']:
if isinstance(hf_ds, Dataset):
return cls(hf_ds, target)
@@ -77,6 +87,8 @@ class MsDataset:
if len(hf_ds.keys()) == 1:
return cls(next(iter(hf_ds.values())), target)
return {k: cls(v, target) for k, v in hf_ds.items()}
elif isinstance(hf_ds, ExternalDataset):
return cls(hf_ds)
else:
raise TypeError(
f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
@@ -96,7 +108,8 @@ class MsDataset:
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = DownloadMode.
REUSE_DATASET_IF_EXISTS
REUSE_DATASET_IF_EXISTS,
**config_kwargs,
) -> Union[dict, 'MsDataset']:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args:
@@ -113,6 +126,7 @@ class MsDataset:
hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope
download_mode (DownloadMode or str, optional): How to treat existing datasets. default
DownloadMode.REUSE_DATASET_IF_EXISTS
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
to the underlying dataset builder.

Returns:
MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
@@ -128,7 +142,8 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
return MsDataset.from_hf_dataset(dataset, target=target)
elif hub == Hubs.modelscope:
return MsDataset._load_ms_dataset(
@@ -140,22 +155,22 @@ class MsDataset:
split=split,
data_dir=data_dir,
data_files=data_files,
download_mode=download_mode)
download_mode=download_mode,
**config_kwargs)

@staticmethod
def _load_ms_dataset(
dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None
) -> Union[dict, 'MsDataset']:
def _load_ms_dataset(dataset_name: Union[str, list],
namespace: Optional[str] = None,
target: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[
str, Sequence[str],
Mapping[str, Union[str, Sequence[str]]]]] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[dict, 'MsDataset']:
if isinstance(dataset_name, str):
dataset_formation = DatasetFormations.native
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
@@ -184,7 +199,8 @@ class MsDataset:
data_dir=data_dir,
data_files=data_files,
cache_dir=MS_DATASETS_CACHE,
download_mode=download_mode.value)
download_mode=download_mode.value,
**config_kwargs)
else:
dataset = MsDataset._load_from_ms(
dataset_name,
@@ -195,7 +211,7 @@ class MsDataset:
subset_name=subset_name,
split=split,
download_mode=download_mode,
)
**config_kwargs)
elif isinstance(dataset_name, list):
if target is None:
target = 'target'
@@ -206,16 +222,15 @@ class MsDataset:
return MsDataset.from_hf_dataset(dataset, target=target)

@staticmethod
def _load_from_ms(
dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
) -> Union[Dataset, DatasetDict]:
def _load_from_ms(dataset_name: str,
dataset_files: dict,
download_dir: str,
namespace: Optional[str] = None,
version: Optional[str] = DEFAULT_DATASET_REVISION,
subset_name: Optional[str] = None,
split: Optional[str] = None,
download_mode: Optional[DownloadMode] = None,
**config_kwargs) -> Union[Dataset, DatasetDict]:
for json_path in dataset_files['.json']:
if json_path.endswith(f'{dataset_name}.json'):
with open(json_path, encoding='utf-8') as dataset_json_file:
@@ -226,7 +241,6 @@ class MsDataset:
meta_map, file_map = get_dataset_files(target_dataset_structure,
dataset_name, namespace,
version)

builder = load_dataset_builder(
dataset_name,
subset_name,
@@ -235,7 +249,8 @@ class MsDataset:
zip_data_files=file_map,
cache_dir=MS_DATASETS_CACHE,
version=version,
split=list(target_dataset_structure.keys()))
split=list(target_dataset_structure.keys()),
**config_kwargs)

download_config = DownloadConfig(
cache_dir=download_dir,
@@ -253,7 +268,6 @@ class MsDataset:
data_dir=download_dir,
)
builder.download_and_prepare(
download_config=download_config,
dl_manager=dl_manager,
download_mode=download_mode.value,
try_from_hf_gcs=False)
@@ -338,6 +352,8 @@ class MsDataset:
self,
columns: Union[str, List[str]] = None,
preprocessors: Union[Callable, List[Callable]] = None,
task_name: str = None,
task_data_config: ConfigDict = None,
**format_kwargs,
):
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
@@ -350,6 +366,8 @@ class MsDataset:
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
task_name (str, default None): task name, refer to :obj:`Tasks` for more details
task_data_config (ConfigDict, default None): config dict for model object.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
@@ -360,6 +378,10 @@ class MsDataset:
raise ImportError(
'The function to_torch_dataset requires pytorch to be installed'
)
if isinstance(self._hf_ds, ExternalDataset):
task_data_config.update({'preprocessor': preprocessors})
return build_task_dataset(task_data_config, task_name,
self._hf_ds.config_kwargs)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(
preprocessors, columns=columns)
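To illustrate the new **config_kwargs / ExternalDataset path end to end, a hedged usage sketch (the dataset id, namespace, extra keyword arguments, and task-dataset type are placeholders, not taken from this diff):

from modelscope.msdatasets import MsDataset
from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks

# Extra keyword arguments are forwarded through load() -> _load_ms_dataset()
# -> load_dataset_builder() and end up in ExternalDataset.config_kwargs.
ds = MsDataset.load(
    'some_zip_only_dataset',      # hypothetical dataset without a csv meta file
    namespace='some_namespace',   # hypothetical namespace
    split='train',
    classes=('person', 'car'))    # forwarded via **config_kwargs

# For an ExternalDataset, to_torch_dataset() builds a registered task dataset
# from task_data_config plus the forwarded config_kwargs.
torch_ds = ds.to_torch_dataset(
    task_name=Tasks.image_segmentation,
    task_data_config=ConfigDict(type='SomeCocoStyleDataset'),  # hypothetical type
    preprocessors=None)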


modelscope/task_datasets/__init__.py → modelscope/msdatasets/task_datasets/__init__.py View File

@@ -8,6 +8,7 @@ if TYPE_CHECKING:
from .builder import TASK_DATASETS, build_task_dataset
from .torch_base_dataset import TorchTaskDataset
from .veco_dataset import VecoDataset
from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset

else:
_import_structure = {
@@ -15,6 +16,8 @@ else:
'builder': ['TASK_DATASETS', 'build_task_dataset'],
'torch_base_dataset': ['TorchTaskDataset'],
'veco_dataset': ['VecoDataset'],
'image_instance_segmentation_coco_dataset':
['ImageInstanceSegmentationCocoDataset']
}
import sys


modelscope/task_datasets/base.py → modelscope/msdatasets/task_datasets/base.py View File


modelscope/task_datasets/builder.py → modelscope/msdatasets/task_datasets/builder.py View File


modelscope/models/cv/image_instance_segmentation/datasets/dataset.py → modelscope/msdatasets/task_datasets/image_instance_segmentation_coco_dataset.py View File

@@ -2,14 +2,32 @@ import os.path as osp

import numpy as np
from pycocotools.coco import COCO
from torch.utils.data import Dataset


class ImageInstanceSegmentationCocoDataset(Dataset):
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from .builder import TASK_DATASETS
from .torch_base_dataset import TorchTaskDataset

DATASET_STRUCTURE = {
'train': {
'annotation': 'annotations/instances_train.json',
'images': 'images/train'
},
'validation': {
'annotation': 'annotations/instances_val.json',
'images': 'images/val'
}
}


@TASK_DATASETS.register_module(
module_name=Models.cascade_mask_rcnn_swin,
group_key=Tasks.image_segmentation)
class ImageInstanceSegmentationCocoDataset(TorchTaskDataset):
"""Coco-style dataset for image instance segmentation.

Args:
ann_file (str): Annotation file path.
split_config (dict): Annotation file path. {"train":"xxxxx"}
classes (Sequence[str], optional): Specify classes to load.
If is None, ``cls.CLASSES`` will be used. Default: None.
data_root (str, optional): Data root for ``ann_file``,
@@ -37,30 +55,27 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')

def __init__(self,
ann_file,
split_config: dict,
preprocessor=None,
classes=None,
data_root=None,
img_prefix='',
seg_prefix=None,
test_mode=False,
filter_empty_gt=True):
self.ann_file = ann_file
self.data_root = data_root
self.img_prefix = img_prefix
filter_empty_gt=True,
**kwargs):
self.data_root = next(iter(split_config.values()))
self.split = next(iter(split_config.keys()))
self.preprocessor = preprocessor

self.ann_file = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['annotation'])

self.img_prefix = osp.join(self.data_root,
DATASET_STRUCTURE[self.split]['images'])
self.seg_prefix = seg_prefix
self.test_mode = test_mode
self.filter_empty_gt = filter_empty_gt
self.CLASSES = self.get_classes(classes)

# join paths if data_root is specified
if self.data_root is not None:
if not osp.isabs(self.ann_file):
self.ann_file = osp.join(self.data_root, self.ann_file)
if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
self.img_prefix = osp.join(self.data_root, self.img_prefix)
if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
self.seg_prefix = osp.join(self.data_root, self.seg_prefix)

# load annotations
self.data_infos = self.load_annotations(self.ann_file)

@@ -71,8 +86,6 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
# set group flag for the sampler
self._set_group_flag()

self.preprocessor = None

def __len__(self):
"""Total number of samples of data."""
return len(self.data_infos)
@@ -326,7 +339,3 @@ class ImageInstanceSegmentationCocoDataset(Dataset):
raise ValueError(f'Unsupported type {type(classes)} of classes.')

return class_names

def to_torch_dataset(self, preprocessors=None):
self.preprocessor = preprocessors
return self
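For reference, a minimal hedged sketch of constructing the refactored dataset directly; the local path is a placeholder and must contain the annotations/instances_train.json and images/train layout declared in DATASET_STRUCTURE above:

from modelscope.msdatasets.task_datasets import ImageInstanceSegmentationCocoDataset

train_ds = ImageInstanceSegmentationCocoDataset(
    split_config={'train': '/path/to/coco_style_root'},  # placeholder root dir
    preprocessor=None,   # a preprocessor can be injected later by the trainer
    classes=None)        # falls back to cls.CLASSES
print(len(train_ds))     # number of (filtered) images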

modelscope/task_datasets/torch_base_dataset.py → modelscope/msdatasets/task_datasets/torch_base_dataset.py View File


modelscope/task_datasets/veco_dataset.py → modelscope/msdatasets/task_datasets/veco_dataset.py View File


+ 92
- 3
modelscope/msdatasets/utils/dataset_builder.py View File

@@ -8,6 +8,7 @@ from datasets.info import DatasetInfo
from datasets.packaged_modules import csv
from datasets.utils.filelock import FileLock

from modelscope.utils.constant import DownloadMode
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -26,11 +27,11 @@ class MsCsvDatasetBuilder(csv.Csv):
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.namespace = namespace
super().__init__(
cache_dir=cache_dir,
name=subset_name,
hash=hash,
namespace=namespace,
data_files=meta_data_files,
**config_kwargs)

@@ -56,6 +57,25 @@ class MsCsvDatasetBuilder(csv.Csv):
os.rmdir(self._cache_dir)
self.zip_data_files = zip_data_files

def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
"""Relative path of this dataset in cache_dir:
Will be:
self.name/self.config.version/self.hash/
or if a namespace has been specified:
self.namespace___self.name/self.config.version/self.hash/
"""
builder_data_dir = self.name if self.namespace is None else f'{self.namespace}___{self.name}'
builder_config = self.config
hash = self.hash
if builder_config:
builder_data_dir = os.path.join(builder_data_dir, self.config_id)
if with_version:
builder_data_dir = os.path.join(builder_data_dir,
str(self.config.version))
if with_hash and hash and isinstance(hash, str):
builder_data_dir = os.path.join(builder_data_dir, hash)
return builder_data_dir

def _build_cache_dir(self):
builder_data_dir = os.path.join(
self._cache_dir_root,
@@ -77,8 +97,15 @@ class MsCsvDatasetBuilder(csv.Csv):
datasets.SplitGenerator(
name=split_name,
gen_kwargs={
'files': dl_manager.iter_files(files),
'base_dir': zip_data_files.get(split_name)
'files':
dl_manager.iter_files(files),
'base_dir':
os.path.join(
zip_data_files.get(split_name),
os.path.splitext(
self.zip_data_files.get(split_name))[0])
if self.zip_data_files.get(split_name) else
zip_data_files.get(split_name)
}))
return splits

@@ -111,3 +138,65 @@ class MsCsvDatasetBuilder(csv.Csv):
logger.error(
f"Failed to read file '{file}' with error {type(e)}: {e}")
raise


class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder):

def __init__(
self,
dataset_name: str,
cache_dir: str,
namespace: str,
subset_name: str,
hash: str,
meta_data_files: Mapping[str, Union[str, Sequence[str]]],
zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None,
**config_kwargs,
):
self.name = dataset_name
self.subset_name = subset_name
self.namespace = namespace
self.hash = hash
self.data_files = meta_data_files
self.zip_data_files = zip_data_files
self.split_path_dict = None
self.config = None
self._cache_dir_root = os.path.expanduser(cache_dir)
self._cache_dir = self._build_cache_dir()
self._config_kwargs = config_kwargs

def download_and_prepare(self, download_mode, dl_manager,
**download_kwargs):
# Prevent parallel disk operations
lock_path = os.path.join(
self._cache_dir_root,
self._cache_dir.replace(os.sep, '_') + '.lock')
with FileLock(lock_path):
data_exists = os.path.exists(self._cache_dir)
if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS:
logger.warning(
f'Reusing dataset {self.name} ({self._cache_dir})')
return
logger.info(f'Generating dataset {self.name} ({self._cache_dir})')
self._download_and_prepare(dl_manager=dl_manager)

def _download_and_prepare(self, dl_manager):
split_path_dict = dl_manager.download_and_extract(self.zip_data_files)
self.split_path_dict = {
k: os.path.join(v,
os.path.splitext(self.zip_data_files[k])[0])
for k, v in split_path_dict.items()
}

def as_dataset(self):
return ExternalDataset(self.split_path_dict, self._config_kwargs)


class ExternalDataset(object):

def __init__(self, split_path_dict, config_kwargs):
config_kwargs.update({'split_config': split_path_dict})
self.config_kwargs = config_kwargs

def __len__(self):
return len(self.config_kwargs['split_config'])
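A small illustration of what the new ExternalDataset wrapper carries (paths and kwargs are placeholders):

from modelscope.msdatasets.utils.dataset_builder import ExternalDataset

ext = ExternalDataset(
    split_path_dict={'train': '/cache/ds/train', 'validation': '/cache/ds/val'},
    config_kwargs={'classes': ('person', 'car')})
print(len(ext))                           # 2 -> number of splits
print(ext.config_kwargs['split_config'])  # the extracted split directories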

+ 28
- 11
modelscope/msdatasets/utils/dataset_utils.py View File

@@ -6,7 +6,7 @@ from datasets.builder import DatasetBuilder

from modelscope.utils.constant import DEFAULT_DATASET_REVISION
from modelscope.utils.logger import get_logger
from .dataset_builder import MsCsvDatasetBuilder
from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder

logger = get_logger()

@@ -87,7 +87,7 @@ def get_dataset_files(subset_split_into: dict,
modelscope_api = HubApi()
for split, info in subset_split_into.items():
meta_map[split] = modelscope_api.get_dataset_file_url(
info['meta'], dataset_name, namespace, revision)
info.get('meta', ''), dataset_name, namespace, revision)
if info.get('file'):
file_map[split] = info['file']
return meta_map, file_map
@@ -99,15 +99,32 @@ def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str,
zip_data_files: Mapping[str, Union[str,
Sequence[str]]],
cache_dir: str, version: Optional[Union[str]],
split: Sequence[str]) -> DatasetBuilder:
split: Sequence[str],
**config_kwargs) -> DatasetBuilder:
sub_dir = os.path.join(version, '_'.join(split))
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
meta_data_file = next(iter(meta_data_files.values()))
if not meta_data_file:
builder_instance = TaskSpecificDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir,
**config_kwargs)
elif meta_data_file.endswith('.csv'):
builder_instance = MsCsvDatasetBuilder(
dataset_name=dataset_name,
namespace=namespace,
cache_dir=cache_dir,
subset_name=subset_name,
meta_data_files=meta_data_files,
zip_data_files=zip_data_files,
hash=sub_dir)
else:
raise NotImplementedError(
f'Dataset meta file extension "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet'
)

return builder_instance

+ 24
- 1
modelscope/outputs.py View File

@@ -188,6 +188,16 @@ TASK_OUTPUTS = {
Tasks.body_2d_keypoints:
[OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES],

# video single object tracking result for single video
# {
# "boxes": [
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# [x1, y1, x2, y2],
# ]
# }
Tasks.video_single_object_tracking: [OutputKeys.BOXES],

# live category recognition result for single video
# {
# "scores": [0.885272, 0.014790631, 0.014558001],
@@ -405,7 +415,7 @@ TASK_OUTPUTS = {

# audio processed for single file in PCM format
# {
# "output_pcm": np.array with shape(samples,) and dtype float32
# "output_pcm": pcm encoded audio bytes
# }
Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM],
Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM],
@@ -417,6 +427,19 @@ TASK_OUTPUTS = {
# }
Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM],

# {
# "kws_list": [
# {
# 'keyword': '', # the keyword spotted
# 'offset': 19.4, # the keyword start time in seconds
# 'length': 0.68, # the keyword length in seconds
# 'confidence': 0.85 # the confidence that it is the keyword
# },
# ...
# ]
# }
Tasks.keyword_spotting: [OutputKeys.KWS_LIST],

# ============ multi-modal tasks ===================

# image caption result for single sample


+ 2
- 0
modelscope/pipelines/audio/__init__.py View File

@@ -6,6 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .ans_pipeline import ANSPipeline
from .asr_inference_pipeline import AutomaticSpeechRecognitionPipeline
from .kws_farfield_pipeline import KWSFarfieldPipeline
from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline
from .linear_aec_pipeline import LinearAECPipeline
from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline
@@ -14,6 +15,7 @@ else:
_import_structure = {
'ans_pipeline': ['ANSPipeline'],
'asr_inference_pipeline': ['AutomaticSpeechRecognitionPipeline'],
'kws_farfield_pipeline': ['KWSFarfieldPipeline'],
'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'],
'linear_aec_pipeline': ['LinearAECPipeline'],
'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'],


+ 81
- 0
modelscope/pipelines/audio/kws_farfield_pipeline.py View File

@@ -0,0 +1,81 @@
import io
import wave
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.keyword_spotting,
module_name=Pipelines.speech_dfsmn_kws_char_farfield)
class KWSFarfieldPipeline(Pipeline):
r"""A Keyword Spotting Inference Pipeline .

When the class is invoked with pipeline.__call__(), it accepts only one parameter:
inputs(str): the path of the wav file
"""
SAMPLE_RATE = 16000
SAMPLE_WIDTH = 2
INPUT_CHANNELS = 3
OUTPUT_CHANNELS = 2

def __init__(self, model, **kwargs):
"""
use `model` to create a kws far field pipeline for prediction
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
self.model = self.model.to(self.device)
self.model.eval()
frame_size = self.INPUT_CHANNELS * self.SAMPLE_WIDTH
self._nframe = self.model.size_in // frame_size
self.frame_count = 0

def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
if isinstance(inputs, bytes):
return dict(input_file=inputs)
elif isinstance(inputs, Dict):
return inputs
else:
raise ValueError(f'Not supported input type: {type(inputs)}')

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
input_file = inputs['input_file']
if isinstance(input_file, bytes):
input_file = io.BytesIO(input_file)
self.frame_count = 0
kws_list = []
with wave.open(input_file, 'rb') as fin:
if 'output_file' in inputs:
with wave.open(inputs['output_file'], 'wb') as fout:
fout.setframerate(self.SAMPLE_RATE)
fout.setnchannels(self.OUTPUT_CHANNELS)
fout.setsampwidth(self.SAMPLE_WIDTH)
self._process(fin, kws_list, fout)
else:
self._process(fin, kws_list)
return {OutputKeys.KWS_LIST: kws_list}

def _process(self,
fin: wave.Wave_read,
kws_list,
fout: wave.Wave_write = None):
data = fin.readframes(self._nframe)
while len(data) >= self.model.size_in:
self.frame_count += self._nframe
result = self.model.forward_decode(data)
if fout:
fout.writeframes(result['pcm'])
if 'kws' in result:
result['kws']['offset'] += self.frame_count / self.SAMPLE_RATE
kws_list.append(result['kws'])
data = fin.readframes(self._nframe)

def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return inputs
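A hedged usage sketch for the new pipeline; the model id is a placeholder (not taken from this diff), and the input wav is assumed to be 3-channel, 16 kHz, 16-bit PCM as implied by INPUT_CHANNELS/SAMPLE_RATE/SAMPLE_WIDTH above:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

kws = pipeline(Tasks.keyword_spotting, model='<dfsmn-kws-char-farfield-model-id>')
with open('/path/to/3ch_16k.wav', 'rb') as f:   # placeholder path
    result = kws(f.read())                      # raw wav bytes are accepted
for hit in result[OutputKeys.KWS_LIST]:
    print(hit['keyword'], hit['offset'], hit['length'], hit['confidence'])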

+ 1
- 1
modelscope/pipelines/base.py View File

@@ -255,7 +255,7 @@ class Pipeline(ABC):
return self._collate_fn(torch.from_numpy(data))
elif isinstance(data, torch.Tensor):
return data.to(self.device)
elif isinstance(data, (str, int, float, bool, type(None))):
elif isinstance(data, (bytes, str, int, float, bool, type(None))):
return data
elif isinstance(data, InputFeatures):
return data


+ 6
- 2
modelscope/pipelines/builder.py View File

@@ -124,12 +124,16 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_classification:
(Pipelines.daily_image_classification,
'damo/cv_vit-base_image-classification_Dailylife-labels'),
Tasks.ocr_recognition: (Pipelines.ocr_recognition,
'damo/cv_convnextTiny_ocr-recognition_damo'),
Tasks.ocr_recognition:
(Pipelines.ocr_recognition,
'damo/cv_convnextTiny_ocr-recognition-general_damo'),
Tasks.skin_retouching: (Pipelines.skin_retouching,
'damo/cv_unet_skin-retouching'),
Tasks.crowd_counting: (Pipelines.crowd_counting,
'damo/cv_hrnet_crowd-counting_dcanet'),
Tasks.video_single_object_tracking:
(Pipelines.video_single_object_tracking,
'damo/cv_vitb_video-single-object-tracking_ostrack'),
}




+ 2
- 0
modelscope/pipelines/cv/__init__.py View File

@@ -10,6 +10,7 @@ if TYPE_CHECKING:
from .cmdssl_video_embedding_pipeline import CMDSSLVideoEmbeddingPipeline
from .crowd_counting_pipeline import CrowdCountingPipeline
from .image_detection_pipeline import ImageDetectionPipeline
from .image_salient_detection_pipeline import ImageSalientDetectionPipeline
from .face_detection_pipeline import FaceDetectionPipeline
from .face_image_generation_pipeline import FaceImageGenerationPipeline
from .face_recognition_pipeline import FaceRecognitionPipeline
@@ -43,6 +44,7 @@ else:
'cmdssl_video_embedding_pipeline': ['CMDSSLVideoEmbeddingPipeline'],
'crowd_counting_pipeline': ['CrowdCountingPipeline'],
'image_detection_pipeline': ['ImageDetectionPipeline'],
'image_salient_detection_pipeline': ['ImageSalientDetectionPipeline'],
'face_detection_pipeline': ['FaceDetectionPipeline'],
'face_image_generation_pipeline': ['FaceImageGenerationPipeline'],
'face_recognition_pipeline': ['FaceRecognitionPipeline'],


+ 47
- 0
modelscope/pipelines/cv/image_salient_detection_pipeline.py View File

@@ -0,0 +1,47 @@
from typing import Any, Dict

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks


@PIPELINES.register_module(
Tasks.image_segmentation, module_name=Pipelines.salient_detection)
class ImageSalientDetectionPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
model: model id on modelscope hub.
"""
super().__init__(model=model, auto_collate=False, **kwargs)

def preprocess(self, input: Input) -> Dict[str, Any]:

img = LoadImage.convert_to_ndarray(input)
img_h, img_w, _ = img.shape
img = self.model.preprocess(img)
result = {'img': img, 'img_w': img_w, 'img_h': img_h}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:

outputs = self.model.inference(input['img'])
result = {
'data': outputs,
'img_w': input['img_w'],
'img_h': input['img_h']
}
return result

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:

data = self.model.postprocess(inputs)
outputs = {
OutputKeys.SCORES: None,
OutputKeys.LABELS: None,
OutputKeys.MASKS: data
}
return outputs
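A hedged usage sketch; the salient-detection model id is a placeholder, not taken from this diff:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

salient = pipeline(Tasks.image_segmentation, model='<u2net-salient-detection-model-id>')
result = salient('/path/to/image.jpg')   # a local path or url is loaded via LoadImage
mask = result[OutputKeys.MASKS]          # saliency mask; SCORES and LABELS are None here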

+ 80
- 0
modelscope/pipelines/cv/video_single_object_tracking_pipeline.py View File

@@ -0,0 +1,80 @@
import os.path as osp
from typing import Any, Dict

import cv2

from modelscope.metainfo import Pipelines
from modelscope.models.cv.video_single_object_tracking.config.ostrack import \
cfg
from modelscope.models.cv.video_single_object_tracking.tracker.ostrack import \
OSTrack
from modelscope.models.cv.video_single_object_tracking.utils.utils import \
check_box
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
Tasks.video_single_object_tracking,
module_name=Pipelines.video_single_object_tracking)
class VideoSingleObjectTrackingPipeline(Pipeline):

def __init__(self, model: str, **kwargs):
"""
use `model` to create a single object tracking pipeline
Args:
model: model id on modelscope hub.
"""
super().__init__(model=model, **kwargs)
self.cfg = cfg
ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE)
logger.info(f'loading model from {ckpt_path}')
self.tracker = OSTrack(ckpt_path, self.device)
logger.info('init tracker done')

def preprocess(self, input) -> Input:
self.video_path = input[0]
self.init_bbox = input[1]
return input

def forward(self, input: Input) -> Dict[str, Any]:
output_boxes = []
cap = cv2.VideoCapture(self.video_path)
success, frame = cap.read()
if success is False:
raise Exception(
'modelscope error: %s can not be decoded by OpenCV.' %
(self.video_path))

init_box = self.init_bbox
frame_h, frame_w = frame.shape[0:2]
if not check_box(init_box, frame_h, frame_w):
raise Exception('modelscope error: init_box out of image range ',
init_box)
output_boxes.append(init_box.copy())
init_box[2] = init_box[2] - init_box[0]
init_box[3] = init_box[3] - init_box[1]
self.tracker.initialize(frame, {'init_bbox': init_box})
logger.info('init bbox done')

while True:
ret, frame = cap.read()
if frame is None:
break
out = self.tracker.track(frame)
state = [int(s) for s in out['target_bbox']]
output_boxes.append(state)
cap.release()
logger.info('tracking process done')

return {
OutputKeys.BOXES: output_boxes,
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
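A hedged usage sketch; it relies on the default model registered for this task in builder.py above, and the video path and initial box are placeholders:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tracker = pipeline(Tasks.video_single_object_tracking)
init_bbox = [100, 100, 300, 300]                # placeholder [x1, y1, x2, y2]
result = tracker(('/path/to/video.mp4', init_bbox))
print(result[OutputKeys.BOXES])                 # one [x1, y1, x2, y2] per frame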

+ 19
- 5
modelscope/pipelines/multi_modal/image_captioning_pipeline.py View File

@@ -1,11 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Optional, Union

import torch

from modelscope.metainfo import Pipelines
from modelscope.models.multi_modal import OfaForAllTasks
from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import OfaPreprocessor, Preprocessor
from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor,
Preprocessor)
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

@@ -35,9 +39,19 @@ class ImageCaptioningPipeline(Pipeline):
else:
raise NotImplementedError
pipe_model.model.eval()
if preprocessor is None and isinstance(pipe_model, OfaForAllTasks):
preprocessor = OfaPreprocessor(model_dir=pipe_model.model_dir)
if preprocessor is None:
if isinstance(pipe_model, OfaForAllTasks):
preprocessor = OfaPreprocessor(pipe_model.model_dir)
elif isinstance(pipe_model, MPlugForAllTasks):
preprocessor = MPlugPreprocessor(pipe_model.model_dir)
super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
if isinstance(self.model, OfaForAllTasks):
return inputs
return {OutputKeys.CAPTION: inputs}
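A hedged end-to-end sketch for the mplug branch; the model id is a placeholder, and the input is assumed to be a PIL image because MPlugPreprocessor's caption path calls image.convert('RGB'):

from PIL import Image
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

captioner = pipeline(Tasks.image_captioning, model='<mplug-image-captioning-model-id>')
result = captioner(Image.open('/path/to/image.jpg'))
print(result[OutputKeys.CAPTION])   # decoded caption string for the mplug model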

+ 10
- 26
modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py View File

@@ -5,13 +5,12 @@ import torch

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.multi_modal import (MPlugForVisualQuestionAnswering,
OfaForAllTasks)
from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import (MPlugVisualQuestionAnsweringPreprocessor,
OfaPreprocessor)
from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor,
Preprocessor)
from modelscope.utils.constant import Tasks

__all__ = ['VisualQuestionAnsweringPipeline']
@@ -23,9 +22,8 @@ __all__ = ['VisualQuestionAnsweringPipeline']
class VisualQuestionAnsweringPipeline(Pipeline):

def __init__(self,
model: Union[MPlugForVisualQuestionAnswering, str],
preprocessor: Optional[
MPlugVisualQuestionAnsweringPreprocessor] = None,
model: Union[Model, str],
preprocessor: Optional[Preprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a visual question answering pipeline for prediction

@@ -35,18 +33,12 @@ class VisualQuestionAnsweringPipeline(Pipeline):
"""
model = model if isinstance(model,
Model) else Model.from_pretrained(model)
self.tokenizer = None
if preprocessor is None:
if isinstance(model, OfaForAllTasks):
preprocessor = OfaPreprocessor(model.model_dir)
elif isinstance(model, MPlugForVisualQuestionAnswering):
preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
model.model_dir)
if isinstance(model, MPlugForVisualQuestionAnswering):
model.eval()
self.tokenizer = model.tokenizer
else:
model.model.eval()
elif isinstance(model, MPlugForAllTasks):
preprocessor = MPlugPreprocessor(model.model_dir)
model.model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)

def forward(self, inputs: Dict[str, Any],
@@ -64,14 +56,6 @@ class VisualQuestionAnsweringPipeline(Pipeline):
Returns:
Dict[str, str]: the prediction results
"""
if self.tokenizer is None:
if isinstance(self.model, OfaForAllTasks):
return inputs
replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))

pred_string = self.tokenizer.decode(inputs[0][0])
for _old, _new in replace_tokens_bert:
pred_string = pred_string.replace(_old, _new)
pred_string.strip()
return {OutputKeys.TEXT: pred_string}
return {OutputKeys.TEXT: inputs}

+ 4
- 6
modelscope/preprocessors/__init__.py View File

@@ -6,7 +6,7 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .common import Compose, ToTensor, Filter
from .asr import WavToScp
from .audio import LinearAECAndFbank
from .image import (LoadImage, load_image,
@@ -14,8 +14,7 @@ if TYPE_CHECKING:
ImageInstanceSegmentationPreprocessor,
ImageDenoisePreprocessor)
from .kws import WavToLists
from .multi_modal import (OfaPreprocessor,
MPlugVisualQuestionAnsweringPreprocessor)
from .multi_modal import (OfaPreprocessor, MPlugPreprocessor)
from .nlp import (Tokenize, SequenceClassificationPreprocessor,
TextGenerationPreprocessor,
TokenClassificationPreprocessor,
@@ -33,7 +32,7 @@ else:
_import_structure = {
'base': ['Preprocessor'],
'builder': ['PREPROCESSORS', 'build_preprocessor'],
'common': ['Compose'],
'common': ['Compose', 'ToTensor', 'Filter'],
'audio': ['LinearAECAndFbank'],
'asr': ['WavToScp'],
'video': ['ReadVideoData'],
@@ -42,8 +41,7 @@ else:
'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor'
],
'kws': ['WavToLists'],
'multi_modal':
['OfaPreprocessor', 'MPlugVisualQuestionAnsweringPreprocessor'],
'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'],
'nlp': [
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassificationPreprocessor',


+ 91
- 2
modelscope/preprocessors/common.py View File

@@ -2,6 +2,10 @@

import time
from collections.abc import Sequence
from typing import Mapping

import numpy as np
import torch

from .builder import PREPROCESSORS, build_preprocessor

@@ -25,12 +29,18 @@ class Compose(object):
if isinstance(transform, dict):
if self.field_name is None:
transform = build_preprocessor(transform, field_name)
self.transforms.append(transform)
else:
# if not found key in field_name, try field_name=None(default_group)
try:
transform = build_preprocessor(transform, field_name)
except KeyError:
transform = build_preprocessor(transform, None)
elif callable(transform):
self.transforms.append(transform)
pass
else:
raise TypeError('transform must be callable or a dict, but got'
f' {type(transform)}')
self.transforms.append(transform)

def __call__(self, data):
for t in self.transforms:
@@ -52,3 +62,82 @@ class Compose(object):
format_string += f'\n {t}'
format_string += '\n)'
return format_string


def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.

Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.

Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""

if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not isinstance(data, str):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')


@PREPROCESSORS.register_module()
class ToTensor(object):
"""Convert target object to tensor.

Args:
keys (Sequence[str]): Keys of data to be converted to Tensor.
Only valid when data is of type `Mapping`. If `keys` is None,
all values will be converted to tensors by default.
"""

def __init__(self, keys=None):
self.keys = keys

def __call__(self, data):
if isinstance(data, Mapping):
if self.keys is None:
self.keys = list(data.keys())

for key in self.keys:
data[key] = to_tensor(data[key])
else:
data = to_tensor(data)

return data

def __repr__(self):
return self.__class__.__name__ + f'(keys={self.keys})'


@PREPROCESSORS.register_module()
class Filter(object):
"""This is usually the last stage of the dataloader transform.
Only data of reserved keys will be kept and passed directly to the model, others will be removed.

Args:
keys (Sequence[str]): Keys of data to be reserved, others will be removed.
"""

def __init__(self, reserved_keys):
self.reserved_keys = reserved_keys

def __call__(self, data):
assert isinstance(data, Mapping)

reserved_data = {}
for key in self.reserved_keys:
reserved_data[key] = data[key]

return reserved_data

def __repr__(self):
return self.__class__.__name__ + f'(keys={self.reserved_keys})'
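A minimal standalone sketch of the two new transforms chained through Compose (the sample keys are arbitrary placeholders):

import numpy as np
from modelscope.preprocessors import Compose, Filter, ToTensor

transforms = Compose([
    ToTensor(keys=['input', 'target']),          # numpy arrays -> torch tensors
    Filter(reserved_keys=['input', 'target']),   # drop every other key
])

sample = {'input': np.zeros((3, 8, 8)), 'target': np.ones((8, 8)), 'meta': 'dropped'}
print(transforms(sample).keys())                 # dict_keys(['input', 'target'])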

+ 8
- 0
modelscope/preprocessors/image.py View File

@@ -151,6 +151,11 @@ class ImageDenoisePreprocessor(Preprocessor):
super().__init__(*args, **kwargs)
self.model_dir: str = model_dir

from .common import Filter

# TODO: `Filter` should be moved to the configuration file of each model
self._transforms = [Filter(reserved_keys=['input', 'target'])]

def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""process the raw input data

@@ -160,6 +165,9 @@ class ImageDenoisePreprocessor(Preprocessor):
Returns:
Dict[str, Any]: the preprocessed data
"""
for t in self._transforms:
data = t(data)

return data




+ 59
- 27
modelscope/preprocessors/multi_modal.py View File

@@ -19,7 +19,7 @@ from .ofa.utils.collate import collate_fn

__all__ = [
'OfaPreprocessor',
'MPlugVisualQuestionAnsweringPreprocessor',
'MPlugPreprocessor',
]


@@ -28,7 +28,7 @@ __all__ = [
class OfaPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -102,39 +102,55 @@ class OfaPreprocessor(Preprocessor):


@PREPROCESSORS.register_module(
Fields.multi_modal,
module_name=Preprocessors.mplug_visual_question_answering)
class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
Fields.multi_modal, module_name=Preprocessors.mplug_tasks_preprocessor)
class MPlugPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via 'bert-base-uncased' tokenizer and configuration

"""
from transformers import BertTokenizer
from modelscope.models.multi_modal.mplug import CONFIG_NAME, VOCAB_NAME, MPlugConfig

super().__init__(*args, **kwargs)
self.model_dir = model_dir

# tokenizer
self.tokenizer = BertTokenizer.from_pretrained(
osp.join(model_dir, VOCAB_NAME))
self._tokenizer = None
self._patch_resize_transform = None

# load configuration
config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME))
@property
def tokenizer(self):
from transformers import BertTokenizer

# Initialize transform
from torchvision import transforms
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
if self._tokenizer is None:
self._tokenizer = BertTokenizer.from_pretrained(self.model_dir)
return self._tokenizer

@property
def patch_resize_transform(self):
if self._patch_resize_transform is None:
from torchvision import transforms
from modelscope.models.multi_modal.mplug import CONFIG_NAME, MPlugConfig

config = MPlugConfig.from_yaml_file(
osp.join(self.model_dir, CONFIG_NAME))

mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)

self._patch_resize_transform = transforms.Compose([
transforms.Resize((config.image_res, config.image_res),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
return self._patch_resize_transform

def __call__(self, *args, **kwargs):
call_mapping = {
Tasks.visual_question_answering: self.vqa_call,
Tasks.image_captioning: self.caption_call
}

self.patch_resize_transform = transforms.Compose([
transforms.Resize((config.image_res, config.image_res),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
self.cfg = Config.from_file(
osp.join(self.model_dir, ModelFile.CONFIGURATION))
return call_mapping[self.cfg.task](*args, **kwargs)

def __call__(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
def vqa_call(self, data: Union[tuple, Dict[str, Any]]) -> Dict[str, Any]:
image: Image.Image = data[0] if isinstance(data,
tuple) else data['image']
question: str = data[1] if isinstance(data,
@@ -147,3 +163,19 @@ class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
return_tensors='pt')

return {'image': image, 'question': question, 'train': False}

def caption_call(
self, data: Union[Image.Image, tuple,
Dict[str, Any]]) -> Dict[str, Any]:
if isinstance(data, Image.Image):
image = data
elif isinstance(data, tuple):
image = data[0]
else:
image = data['image']
image = image.convert('RGB')
image = self.patch_resize_transform(image)
image = torch.stack([image], dim=0)
question = self.tokenizer('', return_tensors='pt')

return {'image': image, 'question': question, 'train': False}
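A hedged sketch of the task-dispatching preprocessor in VQA mode; the model directory is a placeholder and is assumed to contain the mplug config/vocab files plus a configuration.json whose task selects vqa_call:

from PIL import Image
from modelscope.preprocessors import MPlugPreprocessor

preprocessor = MPlugPreprocessor('/path/to/mplug_vqa_model_dir')   # placeholder dir
inputs = preprocessor((Image.open('/path/to/image.jpg'),
                       'what is in the picture?'))
# -> dict with the processed 'image', the tokenized 'question' and 'train': False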

+ 12
- 7
modelscope/preprocessors/nlp.py View File

@@ -4,6 +4,7 @@ import os.path as osp
import uuid
from typing import Any, Dict, Iterable, Optional, Tuple, Union

import numpy as np
from transformers import AutoTokenizer

from modelscope.metainfo import Models, Preprocessors
@@ -43,7 +44,7 @@ class Tokenize(Preprocessor):
class SequenceClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -191,6 +192,10 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
text_b,
return_tensors='pt' if self._mode == ModeKeys.INFERENCE else None,
**self.tokenize_kwargs)
output = {
k: np.array(v) if isinstance(v, list) else v
for k, v in output.items()
}
self.labels_to_id(labels, output)
return output

@@ -240,13 +245,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
if labels is not None:
if isinstance(labels, Iterable) and all([label_can_be_mapped(label) for label in labels]) \
and self.label2id is not None:
output[OutputKeys.LABEL] = [
output[OutputKeys.LABELS] = [
self.label2id[str(label)] for label in labels
]
elif label_can_be_mapped(labels) and self.label2id is not None:
output[OutputKeys.LABEL] = self.label2id[str(labels)]
output[OutputKeys.LABELS] = self.label2id[str(labels)]
else:
output[OutputKeys.LABEL] = labels
output[OutputKeys.LABELS] = labels


@PREPROCESSORS.register_module(
@@ -286,7 +291,7 @@ class ZeroShotClassificationPreprocessor(NLPTokenizerPreprocessorBase):
"""

def __init__(self, model_dir: str, mode=ModeKeys.INFERENCE, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -517,7 +522,7 @@ class NERPreprocessor(Preprocessor):
"""

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path
@@ -609,7 +614,7 @@ class TextErrorCorrectionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
from fairseq.data import Dictionary
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data via the vocab file from the `model_dir` path

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py View File

@@ -22,7 +22,7 @@ __all__ = ['DialogIntentPredictionPreprocessor']
class DialogIntentPredictionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_modeling_preprocessor.py View File

@@ -20,7 +20,7 @@ __all__ = ['DialogModelingPreprocessor']
class DialogModelingPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 1
- 1
modelscope/preprocessors/space/dialog_state_tracking_preprocessor.py View File

@@ -17,7 +17,7 @@ __all__ = ['DialogStateTrackingPreprocessor']
class DialogStateTrackingPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 2
- 1
modelscope/preprocessors/space/fields/gen_field.py View File

@@ -8,6 +8,7 @@ from itertools import chain
import numpy as np

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from modelscope.utils.nlp.space import ontology, utils
from modelscope.utils.nlp.space.db_ops import MultiWozDB
@@ -343,7 +344,7 @@ class MultiWOZBPETextField(BPETextField):
]
special_tokens.extend(self.add_sepcial_tokens())
self.tokenizer = Tokenizer(
vocab_path=os.path.join(model_dir, 'vocab.txt'),
vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
special_tokens=special_tokens,
tokenizer_type=config.BPETextField.tokenizer_type)
self.understand_ids = self.tokenizer.convert_tokens_to_ids(


+ 2
- 1
modelscope/preprocessors/space/fields/intent_field.py View File

@@ -14,6 +14,7 @@ import numpy as np
from tqdm import tqdm

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.constant import ModelFile
from modelscope.utils.nlp.space import ontology
from modelscope.utils.nlp.space.scores import hierarchical_set_score
from modelscope.utils.nlp.space.utils import list2np
@@ -50,7 +51,7 @@ class BPETextField(object):
]
special_tokens.extend(self.add_sepcial_tokens())
self.tokenizer = Tokenizer(
vocab_path=os.path.join(model_dir, 'vocab.txt'),
vocab_path=os.path.join(model_dir, ModelFile.VOCAB_FILE),
special_tokens=special_tokens,
tokenizer_type=config.BPETextField.tokenizer_type)
self.understand_ids = self.numericalize(self.understand_tokens)


+ 1
- 1
modelscope/preprocessors/star/conversational_text_to_sql_preprocessor.py View File

@@ -28,7 +28,7 @@ __all__ = ['ConversationalTextToSqlPreprocessor']
class ConversationalTextToSqlPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path
"""preprocess the data

Args:
model_dir (str): model path


+ 9
- 0
modelscope/preprocessors/star/fields/common_utils.py View File

@@ -193,6 +193,15 @@ class SubPreprocessor():

from nltk import data
data.path.append(os.path.join(self.model_dir, 'nltk_data'))

zippath = os.path.join(self.model_dir, 'nltk_data/tokenizers/punkt')
if os.path.exists(zippath):
print('punkt already exists!')
else:
import zipfile
with zipfile.ZipFile(zippath + '.zip') as zf:
zf.extractall(
os.path.join(self.model_dir, 'nltk_data/tokenizers/'))
question = nltk.word_tokenize(question)
question = mwtokenizer.tokenize(question)



+ 0
- 4
modelscope/trainers/cv/image_instance_segmentation_trainer.py View File

@@ -22,7 +22,3 @@ class ImageInstanceSegmentationTrainer(EpochBasedTrainer):

def prediction_step(self, model, inputs):
pass

def to_task_dataset(self, datasets, mode, preprocessor=None):
# wait for dataset interface to become stable...
return datasets.to_torch_dataset(preprocessor)

+ 0
- 1
modelscope/trainers/cv/image_portrait_enhancement_trainer.py View File

@@ -40,7 +40,6 @@ class ImagePortraitEnhancementTrainer(EpochBasedTrainer):

train_outputs = dict()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs, Mapping):
d_loss = model._train_forward_d(**inputs)


+ 1
- 1
modelscope/trainers/hooks/hook.py View File

@@ -192,7 +192,7 @@ class Hook:
Whether to reach the end of every epoch
Returns: bool
"""
return trainer.inner_iter + 1 == len(trainer.data_loader)
return trainer.inner_iter + 1 == trainer.iters_per_epoch

def is_last_epoch(self, trainer):
"""


+ 1
- 1
modelscope/trainers/hooks/logger/text_logger_hook.py View File

@@ -93,7 +93,7 @@ class TextLoggerHook(LoggerHook):
lr_str = f'{lr_key}: {log_dict[lr_key]:.3e}'

if self.by_epoch:
log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{len(trainer.data_loader)}]\t'
log_str = f'{epoch_key} [{log_dict[epoch_key]}][{log_dict[iter_key]}/{trainer.iters_per_epoch}]\t'
else:
log_str = f'{iter_key} [{log_dict[iter_key]}/{trainer.max_iters}]\t'
log_str += f'{lr_str}, '


+ 39
- 15
modelscope/trainers/nlp_trainer.py View File

@@ -110,9 +110,11 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
self.train_keys = build_dataset_keys(
self.cfg.dataset.train if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'train') else None)
# TODO eval may has special keys, which is now not supported.
# because there is only one preprocessor in the trainer, and it only supports one group of keys.
self.eval_keys = self.train_keys
self.eval_keys = build_dataset_keys(
self.cfg.dataset.val if hasattr(self.cfg, 'dataset')
and hasattr(self.cfg.dataset, 'val') else None)
if len(self.eval_keys) == 0:
self.eval_keys = self.train_keys

super().__init__(
model=model_dir,
@@ -148,7 +150,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
elif isinstance(model, nn.Module):
return model

def build_preprocessor(self) -> Preprocessor:
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build the preprocessor.

User can override this method to implement custom logits.
@@ -159,16 +161,38 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
model_args = {} if self.label2id is None else {
'label2id': self.label2id
}
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
**model_args,
'mode':
ModeKeys.TRAIN,
**self.train_keys,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))

field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
_train_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.train_keys, 'mode': ModeKeys.TRAIN
})
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
_eval_cfg.update({
'model_dir': self.model_dir,
**model_args,
**self.eval_keys, 'mode': ModeKeys.EVAL
})
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor
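The hunk above lets NlpEpochBasedTrainer build separate train and eval preprocessors when the preprocessor config is split into train/val sections. A hypothetical config fragment written as a Python dict (only the train/val split comes from the code above; the type names are placeholders):

# Hypothetical preprocessor section; the concrete type names are placeholders.
preprocessor_cfg = {
    'train': {'type': 'some-train-preprocessor'},
    'val': {'type': 'some-eval-preprocessor'},
}
# build_preprocessor() then injects model_dir, the optional label2id mapping,
# the dataset keys and the mode (ModeKeys.TRAIN / ModeKeys.EVAL) into each
# side before calling build_preprocessor(cfg, field_name).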


@TRAINERS.register_module(module_name=Trainers.nlp_veco_trainer)
@@ -178,7 +202,7 @@ class VecoTrainer(NlpEpochBasedTrainer):
"""Veco evaluates the datasets one by one.

"""
from modelscope.task_datasets import VecoDataset
from modelscope.msdatasets.task_datasets import VecoDataset
self.model.eval()
self._mode = ModeKeys.EVAL
metric_values = {}


+ 151
- 76
modelscope/trainers/trainer.py View File

@@ -5,15 +5,15 @@ import time
from collections.abc import Mapping
from distutils.version import LooseVersion
from functools import partial
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union

import json
import numpy as np
import torch
from addict import Dict
from torch import distributed as dist
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate
from torch.utils.data.distributed import DistributedSampler

from modelscope.hub.snapshot_download import snapshot_download
@@ -21,23 +21,26 @@ from modelscope.metainfo import Trainers
from modelscope.metrics import build_metric, task_default_metrics
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors import build_preprocessor
from modelscope.msdatasets.task_datasets.builder import build_task_dataset
from modelscope.msdatasets.task_datasets.torch_base_dataset import \
TorchTaskDataset
from modelscope.preprocessors.base import Preprocessor
from modelscope.task_datasets.builder import build_task_dataset
from modelscope.task_datasets.torch_base_dataset import TorchTaskDataset
from modelscope.preprocessors.builder import build_preprocessor
from modelscope.preprocessors.common import Compose
from modelscope.trainers.hooks.builder import HOOKS
from modelscope.trainers.hooks.priority import Priority, get_priority
from modelscope.trainers.lrscheduler.builder import build_lr_scheduler
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Hubs, ModeKeys,
ModelFile, Tasks, TrainerStages)
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields,
ConfigKeys, Hubs, ModeKeys, ModelFile,
Tasks, TrainerStages)
from modelscope.utils.data_utils import to_device
from modelscope.utils.file_utils import func_receive_dict_inputs
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import build_from_cfg
from modelscope.utils.tensor_utils import torch_default_data_collator
from modelscope.utils.torch_utils import (broadcast, create_device,
get_dist_info, init_dist)
from modelscope.utils.torch_utils import (create_device, get_dist_info,
init_dist)
from .base import BaseTrainer
from .builder import TRAINERS
from .default_config import DEFAULT_CONFIG
@@ -83,7 +86,8 @@ class EpochBasedTrainer(BaseTrainer):
data_collator: Optional[Callable] = None,
train_dataset: Optional[Union[MsDataset, Dataset]] = None,
eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
preprocessor: Optional[Preprocessor] = None,
preprocessor: Optional[Union[Preprocessor,
Dict[str, Preprocessor]]] = None,
optimizers: Tuple[torch.optim.Optimizer,
torch.optim.lr_scheduler._LRScheduler] = (None,
None),
@@ -120,24 +124,46 @@ class EpochBasedTrainer(BaseTrainer):
else:
self.work_dir = self.cfg.train.get('work_dir', './work_dir')

self.preprocessor = None
self.train_preprocessor, self.eval_preprocessor = None, None
if isinstance(preprocessor, Preprocessor):
self.preprocessor = preprocessor
elif hasattr(self.cfg, 'preprocessor'):
self.preprocessor = self.build_preprocessor()
if self.preprocessor is not None:
self.preprocessor.mode = ModeKeys.TRAIN
self.train_preprocessor = preprocessor
self.eval_preprocessor = preprocessor
elif isinstance(preprocessor, Mapping):
if not (ConfigKeys.train in preprocessor
or ConfigKeys.val in preprocessor):
raise ValueError(
f'The preprocessor dict must contain a `{ConfigKeys.train}` or `{ConfigKeys.val}` key!'
)
if ConfigKeys.train in preprocessor:
assert isinstance(preprocessor[ConfigKeys.train], Preprocessor)
self.train_preprocessor = preprocessor[ConfigKeys.train]
if ConfigKeys.val in preprocessor:
assert isinstance(preprocessor[ConfigKeys.val], Preprocessor)
self.eval_preprocessor = preprocessor[ConfigKeys.val]
elif hasattr(self.cfg, ConfigFields.preprocessor):
self.train_preprocessor, self.eval_preprocessor = self.build_preprocessor(
)

if self.train_preprocessor is not None:
self.train_preprocessor.mode = ModeKeys.TRAIN
if self.eval_preprocessor is not None:
self.eval_preprocessor.mode = ModeKeys.EVAL

device_name = kwargs.get('device', 'gpu')
assert device_name in ['gpu',
'cpu'], 'device should be either cpu or gpu.'
self.device = create_device(device_name == 'cpu')

self.train_dataset = self.to_task_dataset(
train_dataset, mode=ModeKeys.TRAIN, preprocessor=self.preprocessor)
train_dataset,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)
self.eval_dataset = self.to_task_dataset(
eval_dataset, mode=ModeKeys.EVAL, preprocessor=self.preprocessor)
eval_dataset,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

self.data_collator = data_collator if data_collator is not None else torch_default_data_collator
self.data_collator = data_collator if data_collator is not None else default_collate
self.metrics = self.get_metrics()
self._metric_values = None
self.optimizers = optimizers
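With the change above, the trainer's preprocessor argument accepts either a single Preprocessor (reused for both modes) or a dict keyed by ConfigKeys.train / ConfigKeys.val. A minimal sketch of the dict form, assuming a trivial user-defined preprocessor just to show the accepted shape:

from modelscope.preprocessors.base import Preprocessor
from modelscope.utils.constant import ConfigKeys

class NoOpPreprocessor(Preprocessor):
    """Placeholder preprocessor, only here to illustrate the dict form."""

    def __call__(self, data):
        return data

preprocessor = {
    ConfigKeys.train: NoOpPreprocessor(),
    ConfigKeys.val: NoOpPreprocessor(),
}
# The trainer sets ModeKeys.TRAIN / ModeKeys.EVAL on the respective entry and
# uses it when wrapping the train / eval dataset.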
@@ -155,6 +181,16 @@ class EpochBasedTrainer(BaseTrainer):
else:
self._max_epochs = kwargs['max_epochs']

self._train_iters_per_epoch = kwargs.get('train_iters_per_epoch', None)
self._eval_iters_per_epoch = kwargs.get('val_iters_per_epoch', None)
if self._train_iters_per_epoch is None and hasattr(
self.cfg.train, 'train_iters_per_epoch'):
self._train_iters_per_epoch = self.cfg.train.train_iters_per_epoch
if self._eval_iters_per_epoch is None and hasattr(
self.cfg, 'evaluation') and hasattr(self.cfg.evaluation,
'val_iters_per_epoch'):
self._eval_iters_per_epoch = self.cfg.evaluation.val_iters_per_epoch

self.use_fp16 = kwargs.get('use_fp16', False)

# TODO @wenmeng.zwm add seed init fn
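The new per-mode iteration caps can be supplied either as trainer kwargs or through the configuration. A hypothetical config fragment showing where the trainer looks for them (the numbers are placeholders, other fields omitted):

# Only the two *_iters_per_epoch keys and their location come from the code
# above; the values are placeholders.
cfg_fragment = {
    'train': {
        'train_iters_per_epoch': 1000,
    },
    'evaluation': {
        'val_iters_per_epoch': 100,
    },
}
# Equivalent via kwargs:
#   EpochBasedTrainer(..., train_iters_per_epoch=1000, val_iters_per_epoch=100)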
@@ -211,7 +247,32 @@ class EpochBasedTrainer(BaseTrainer):
@property
def max_iters(self):
"""int: Maximum training iterations."""
return self._max_epochs * len(self.data_loader)
return self._max_epochs * self.iters_per_epoch

@property
def iters_per_epoch(self):
"""int: Total iterations of one epoch"""

def _get_data_len(data_loader):
try:
return len(data_loader)
except Exception as e:
self.logger.error(e)
raise ValueError(
'Please implement the ``__len__`` method for your dataset, '
'or add `train_iters_per_epoch` / `val_iters_per_epoch` '
'to your configuration file or kwargs')

if self.mode == ModeKeys.TRAIN:
if self._train_iters_per_epoch is not None:
return self._train_iters_per_epoch
else:
return _get_data_len(self.train_dataloader)
elif self.mode == ModeKeys.EVAL:
if self._eval_iters_per_epoch is not None:
return self._eval_iters_per_epoch
else:
return _get_data_len(self.eval_dataloader)
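The fallback above means a map-style dataset only needs __len__; for an iterable dataset without a length, one of the caps has to be supplied explicitly. A minimal sketch of the case that would otherwise fail (the dataset is illustrative):

import torch
from torch.utils.data import DataLoader, IterableDataset

class StreamingDataset(IterableDataset):
    """Illustrative dataset without __len__, e.g. a sample stream."""

    def __iter__(self):
        while True:
            yield {'input': torch.zeros(4)}

loader = DataLoader(StreamingDataset(), batch_size=2)
# len(loader) raises TypeError here, so iters_per_epoch cannot be inferred;
# pass train_iters_per_epoch / val_iters_per_epoch via kwargs or config instead.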

def to_task_dataset(self,
datasets: Union[Dataset, List[Dataset]],
@@ -228,14 +289,21 @@ class EpochBasedTrainer(BaseTrainer):
if isinstance(datasets, TorchTaskDataset):
return datasets
elif isinstance(datasets, MsDataset):
datasets = datasets.to_torch_dataset(
preprocessors=self.preprocessor)
return datasets
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
return datasets.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor)
elif isinstance(datasets, List) and isinstance(
datasets[0], MsDataset):
cfg = ConfigDict(type=self.cfg.model.type, mode=mode) if hasattr(self.cfg, ConfigFields.model) \
else ConfigDict(type=None, mode=mode)
datasets = [
d.to_torch_dataset(preprocessor=self.preprocessor)
for d in datasets
d.to_torch_dataset(
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor) for d in datasets
]
cfg = ConfigDict(
type=self.cfg.task, mode=mode, datasets=datasets)
@@ -258,24 +326,44 @@ class EpochBasedTrainer(BaseTrainer):
else:
return datasets

def build_preprocessor(self) -> Preprocessor:
"""Build the preprocessor.
def build_preprocessor(self) -> Tuple[Preprocessor, Preprocessor]:
"""Build train and eval preprocessor.

Users can override this method to implement custom logic.

Returns: The preprocessor instance.
Returns: The train and eval preprocessor instances.

"""
# TODO @wenmeng.zwm @jiangnana.jnn add support for different preprocessor
# when they are different ones in training and evaluation
cfg = ConfigDict({
**getattr(self.cfg, 'preprocessor'),
'model_dir':
self.model_dir,
'mode':
ModeKeys.TRAIN,
})
return build_preprocessor(cfg, Tasks.find_field_by_task(self.cfg.task))
field_name = Tasks.find_field_by_task(self.cfg.task)
train_preprocessor, eval_preprocessor = None, None
_train_cfg, _eval_cfg = {}, {}
_default_args = {'model_dir': self.model_dir}

if 'type' not in self.cfg.preprocessor and (
'train' in self.cfg.preprocessor
or 'val' in self.cfg.preprocessor):
if 'train' in self.cfg.preprocessor:
_train_cfg = self.cfg.preprocessor.train
if 'val' in self.cfg.preprocessor:
_eval_cfg = self.cfg.preprocessor.val
else:
_train_cfg = self.cfg.preprocessor
_eval_cfg = self.cfg.preprocessor

if len(_train_cfg):
if isinstance(_train_cfg, Sequence):
# TODO: for Sequence configs, adapt the `mode` and `model_dir` args,
# and add mode support to Compose (or another plan)
raise NotImplementedError('Not supported yet!')
_train_cfg.update(_default_args)
train_preprocessor = build_preprocessor(_train_cfg, field_name)
if len(_eval_cfg):
if isinstance(_eval_cfg, Sequence):
raise NotImplementedError('Not supported yet!')
_eval_cfg.update(_default_args)
eval_preprocessor = build_preprocessor(_eval_cfg, field_name)

return train_preprocessor, eval_preprocessor
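When the preprocessor config keeps a single top-level type (no train/val split), the same config is reused for both sides, preserving the previous behaviour; list-style (Sequence) configs are rejected for now. Both accepted shapes, with placeholder type names:

# Shared form: one config used for training and evaluation alike.
shared_cfg = {'type': 'some-preprocessor'}

# Split form: separate configs selected by the branches above.
split_cfg = {
    'train': {'type': 'some-train-preprocessor'},
    'val': {'type': 'some-eval-preprocessor'},
}
# In both cases model_dir is injected before build_preprocessor() is called.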

def get_metrics(self) -> List[str]:
"""Get the metric class types.
@@ -373,34 +461,6 @@ class EpochBasedTrainer(BaseTrainer):

return build_parallel(dp_cfg)

def collate_fn(self, data):
"""Prepare the input just before the forward function.
This method will move the tensors to the right device.
Usually this method does not need to be overridden.

Args:
data: The data out of the dataloader.

Returns: The processed data.

"""
from torch.utils.data.dataloader import default_collate
if isinstance(data, dict) or isinstance(data, Mapping):
return type(data)({k: self.collate_fn(v) for k, v in data.items()})
elif isinstance(data, (tuple, list)):
if isinstance(data[0], (int, float)):
return default_collate(data).to(self.device)
else:
return type(data)(self.collate_fn(v) for v in data)
elif isinstance(data, np.ndarray):
return self.collate_fn(torch.from_numpy(data))
elif isinstance(data, torch.Tensor):
return data.to(self.device)
elif isinstance(data, (str, int, float, bool)):
return data
else:
raise ValueError(f'Unsupported data type {type(data)}')
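Device placement now happens once per batch via to_device, and batching falls back to PyTorch's default_collate, so the recursive collate_fn above is no longer needed. A simplified stand-in for such a recursive device-move helper (not the actual modelscope.utils.data_utils.to_device implementation):

from collections.abc import Mapping

import torch

def move_to_device(data, device):
    """Recursively move tensors in nested containers to the target device."""
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, Mapping):
        return type(data)(
            {k: move_to_device(v, device) for k, v in data.items()})
    if isinstance(data, (list, tuple)):
        return type(data)(move_to_device(v, device) for v in data)
    return data  # str/int/float/bool etc. are left untouched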

def train_step(self, model, inputs):
""" Perform a training step on a batch of inputs.

@@ -421,7 +481,6 @@ class EpochBasedTrainer(BaseTrainer):
# TODO: find more pretty way to change mode
model.train()
self._mode = ModeKeys.TRAIN
inputs = self.collate_fn(inputs)
# call model forward but not __call__ to skip postprocess
if isinstance(inputs,
Mapping) and not func_receive_dict_inputs(model.forward):
@@ -486,7 +545,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.train_dataset is None:
train_data = self.cfg.dataset.train
self.train_dataset = self.build_dataset(
train_data, mode=ModeKeys.TRAIN)
train_data,
mode=ModeKeys.TRAIN,
preprocessor=self.train_preprocessor)

data_loader = self._build_dataloader_with_dataset(
self.train_dataset,
@@ -505,7 +566,9 @@ class EpochBasedTrainer(BaseTrainer):
if self.eval_dataset is None:
val_data = self.cfg.dataset.val
self.eval_dataset = self.build_dataset(
val_data, mode=ModeKeys.EVAL)
val_data,
mode=ModeKeys.EVAL,
preprocessor=self.eval_preprocessor)

batch_size = self.cfg.evaluation.batch_size
workers = self.cfg.evaluation.workers
@@ -521,7 +584,7 @@ class EpochBasedTrainer(BaseTrainer):
)
return data_loader

def build_dataset(self, data_cfg, mode):
def build_dataset(self, data_cfg, mode, preprocessor=None):
""" Build torch dataset object using data config
"""
dataset = MsDataset.load(
@@ -530,9 +593,13 @@ class EpochBasedTrainer(BaseTrainer):
subset_name=data_cfg.subset_name if hasattr(
data_cfg, 'subset_name') else None,
hub=data_cfg.hub if hasattr(data_cfg, 'hub') else Hubs.modelscope,
**data_cfg,
)
cfg = ConfigDict(type=self.cfg.model.type, mode=mode)
torch_dataset = dataset.to_torch_dataset(
preprocessors=self.preprocessor, )
task_data_config=cfg,
task_name=self.cfg.task,
preprocessors=preprocessor)
dataset = self.to_task_dataset(torch_dataset, mode)
return dataset
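build_dataset now forwards the whole dataset config to MsDataset.load and wraps the result as a task dataset carrying the model type and mode. A hypothetical dataset section of the config (subset_name and hub are the fields read explicitly above; dataset_name and split are assumptions added for illustration):

# Hypothetical dataset section; only subset_name and hub appear in the code
# above, the remaining keys and all values are placeholders.
dataset_cfg = {
    'train': {
        'dataset_name': 'some/dataset-id',
        'subset_name': 'default',
        'split': 'train',
        'hub': 'modelscope',
    },
    'val': {
        'dataset_name': 'some/dataset-id',
        'subset_name': 'default',
        'split': 'validation',
        'hub': 'modelscope',
    },
}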

@@ -698,6 +765,7 @@ class EpochBasedTrainer(BaseTrainer):
self.invoke_hook(TrainerStages.before_train_epoch)
time.sleep(2) # Prevent possible deadlock during epoch transition
for i, data_batch in enumerate(data_loader):
data_batch = to_device(data_batch, self.device)
self.data_batch = data_batch
self._inner_iter = i
self.invoke_hook(TrainerStages.before_train_iter)
@@ -706,6 +774,9 @@ class EpochBasedTrainer(BaseTrainer):
del self.data_batch
self._iter += 1

if i + 1 >= self.iters_per_epoch:
break

self.invoke_hook(TrainerStages.after_train_epoch)
self._epoch += 1

@@ -721,17 +792,21 @@ class EpochBasedTrainer(BaseTrainer):
metric_values = multi_gpu_test(
self.model,
data_loader,
device=self.device,
tmpdir=None,
gpu_collect=False,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes)
metric_classes=metric_classes,
data_loader_iters_per_gpu=self.iters_per_epoch)
else:
from modelscope.trainers.utils.inference import single_gpu_test
metric_values = single_gpu_test(
self.model,
data_loader,
data_collate_fn=self.collate_fn,
metric_classes=metric_classes)
device=self.device,
metric_classes=metric_classes,
data_loader_iters=self.iters_per_epoch)

self._inner_iter = self.iters_per_epoch - 1 # start from index 0

return metric_values
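The evaluation helpers now receive the device and an explicit per-epoch iteration cap instead of a collate function. A sketch of the single-GPU call as wired above (trainer stands for an EpochBasedTrainer instance; metric_classes are the metric objects built from its config):

from modelscope.trainers.utils.inference import single_gpu_test

# Mirrors the call site above; all arguments come from the diff, only the
# surrounding variable names are illustrative.
metric_values = single_gpu_test(
    trainer.model,
    data_loader,
    device=trainer.device,
    metric_classes=metric_classes,
    data_loader_iters=trainer.iters_per_epoch)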



Some files were not shown because too many files changed in this diff
