From d4692b5ada73936b119daf440f9409f06524c04d Mon Sep 17 00:00:00 2001
From: "xixing.tj"
Date: Tue, 28 Jun 2022 14:03:01 +0800
Subject: [PATCH 1/9] [to #42322933]Merge branch 'master' into ocr/ocr_detection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the ocr_detection unit test bug on the master branch

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9112290

* create ocr_detection task
* fix code check error
* fix code check error
* fix code check issue
* fix code check issue
* replace c++ nms with python version
* fix code check issue
* fix code check issue
* rename maas_lib
* merge master to ocr/ocr_detection
* add model_hub sup for ocr_detection
* fix bug
* replace c++ decoder with python version
* fix bug
* Merge branch 'master' into ocr/ocr_detection
* merge master
* fix code check
* update
* add requirements for ocr_detection
* fix model_hub fetch bug
* remove debug code
* Merge branch 'master' into ocr/ocr_detection
* add local test image for ocr_detection
* update requirements for model_hub
* Merge branch 'master' into ocr/ocr_detection
* fix bug for full case test
* remove ema for ocr_detection
* Merge branch 'master' into ocr/ocr_detection
* apply ocr_detection test case
* Merge branch 'master' into ocr/ocr_detection
* update slim dependency for ocr_detection
* add more test case for ocr_detection
* release tf graph before create
* recover ema for ocr_detection model
* fix code
* Merge branch 'master' into ocr/ocr_detection
* fix code
---
 .../pipelines/cv/ocr_detection_pipeline.py    | 94 ++++++++++---------
 .../model_resnet_mutex_v4_linewithchar.py     |  6 +-
 .../pipelines/cv/ocr_utils/resnet18_v1.py     |  6 +-
 .../pipelines/cv/ocr_utils/resnet_utils.py    |  6 +-
 tests/pipelines/test_ocr_detection.py         |  5 +
 5 files changed, 72 insertions(+), 45 deletions(-)

diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py
index 0502fe36..4856b06b 100644
--- a/modelscope/pipelines/cv/ocr_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py
@@ -8,7 +8,6 @@ import cv2
 import numpy as np
 import PIL
 import tensorflow as tf
-import tf_slim as slim
 
 from modelscope.metainfo import Pipelines
 from modelscope.pipelines.base import Input
@@ -19,6 +18,11 @@ from ..base import Pipeline
 from ..builder import PIPELINES
 from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils
 
+if tf.__version__ >= '2.0':
+    import tf_slim as slim
+else:
+    from tensorflow.contrib import slim
+
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
     tf.compat.v1.disable_eager_execution()
@@ -44,6 +48,7 @@ class OCRDetectionPipeline(Pipeline):
 
     def __init__(self, model: str):
         super().__init__(model=model)
+        tf.reset_default_graph()
         model_path = osp.join(
             osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
             'checkpoint-80000')
@@ -51,51 +56,56 @@
         config = tf.ConfigProto(allow_soft_placement=True)
         config.gpu_options.allow_growth = True
         self._session = tf.Session(config=config)
-        global_step = tf.get_variable(
-            'global_step', [],
-            initializer=tf.constant_initializer(0),
-            dtype=tf.int64,
-            trainable=False)
-        variable_averages = tf.train.ExponentialMovingAverage(
-            0.997, global_step)
         self.input_images = tf.placeholder(
             tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
         self.output = {}
 
-        # detector
-        detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector()
-        all_maps = detector.build_model(self.input_images, is_training=False)
-
-        # decode local predictions
-
all_nodes, all_links, all_reg = [], [], [] - for i, maps in enumerate(all_maps): - cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] - reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) - - cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) - - lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2]) - lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:]) - lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) - - all_nodes.append(cls_prob) - all_links.append(lnk_prob) - all_reg.append(reg_maps) - - # decode segments and links - image_size = tf.shape(self.input_images)[1:3] - segments, group_indices, segment_counts, _ = ops.decode_segments_links_python( - image_size, - all_nodes, - all_links, - all_reg, - anchor_sizes=list(detector.anchor_sizes)) - - # combine segments - combined_rboxes, combined_counts = ops.combine_segments_python( - segments, group_indices, segment_counts) - self.output['combined_rboxes'] = combined_rboxes - self.output['combined_counts'] = combined_counts + with tf.variable_scope('', reuse=tf.AUTO_REUSE): + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + dtype=tf.int64, + trainable=False) + variable_averages = tf.train.ExponentialMovingAverage( + 0.997, global_step) + + # detector + detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector() + all_maps = detector.build_model( + self.input_images, is_training=False) + + # decode local predictions + all_nodes, all_links, all_reg = [], [], [] + for i, maps in enumerate(all_maps): + cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] + reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) + + cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) + + lnk_prob_pos = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, :2]) + lnk_prob_mut = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, 2:]) + lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) + + all_nodes.append(cls_prob) + all_links.append(lnk_prob) + all_reg.append(reg_maps) + + # decode segments and links + image_size = tf.shape(self.input_images)[1:3] + segments, group_indices, segment_counts, _ = ops.decode_segments_links_python( + image_size, + all_nodes, + all_links, + all_reg, + anchor_sizes=list(detector.anchor_sizes)) + + # combine segments + combined_rboxes, combined_counts = ops.combine_segments_python( + segments, group_indices, segment_counts) + self.output['combined_rboxes'] = combined_rboxes + self.output['combined_counts'] = combined_counts with self._session.as_default() as sess: logger.info(f'loading model from {model_path}') diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py index 50b8ba02..d03ff405 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py +++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py @@ -1,8 +1,12 @@ import tensorflow as tf -import tf_slim as slim from . 
import ops, resnet18_v1, resnet_utils
 
+if tf.__version__ >= '2.0':
+    import tf_slim as slim
+else:
+    from tensorflow.contrib import slim
+
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 
diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
index 6371d4e5..7930c5a3 100644
--- a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
+++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
@@ -30,10 +30,14 @@ ResNet-101 for semantic segmentation into 21 classes:
    output_stride=16)
 """
 import tensorflow as tf
-import tf_slim as slim
 
 from . import resnet_utils
 
+if tf.__version__ >= '2.0':
+    import tf_slim as slim
+else:
+    from tensorflow.contrib import slim
+
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 
diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py
index e0e240c8..0a9af224 100644
--- a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py
+++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py
@@ -19,7 +19,11 @@ implementation is more memory efficient.
 import collections
 
 import tensorflow as tf
-import tf_slim as slim
+
+if tf.__version__ >= '2.0':
+    import tf_slim as slim
+else:
+    from tensorflow.contrib import slim
 
 if tf.__version__ >= '2.0':
     tf = tf.compat.v1
 
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index 986961b7..d1ecd4e4 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -27,6 +27,11 @@ class OCRDetectionTest(unittest.TestCase):
         print('ocr detection results: ')
         print(result)
 
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id)
+        self.pipeline_inference(ocr_detection, self.test_image)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         ocr_detection = pipeline(Tasks.ocr_detection)

From 664de39b7908fba039edd5adb0b601f7535c70bb Mon Sep 17 00:00:00 2001
From: "yanheng.wyh"
Date: Tue, 28 Jun 2022 14:04:40 +0800
Subject: [PATCH 2/9] [to #42322933]animal recognition model

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9126742

* animal recognition model
* update codes
* delete file
* f
* pre commits
* revise
* fix last comment
* fix comments
* fix precommit
* fix comments
* Merge remote-tracking branch 'origin' into cv/animalRecog
* fix comments
---
 modelscope/metainfo.py                        |   1 +
 .../models/cv/animal_recognition/__init__.py  |   0
 .../models/cv/animal_recognition/resnet.py    | 430 ++++++++++++++++++
 .../models/cv/animal_recognition/splat.py     | 125 +++++
 modelscope/pipelines/cv/__init__.py           |   1 +
 .../pipelines/cv/animal_recog_pipeline.py     | 127 ++++++
 tests/pipelines/test_animal_recognation.py    |  20 +
 7 files changed, 704 insertions(+)
 create mode 100644 modelscope/models/cv/animal_recognition/__init__.py
 create mode 100644 modelscope/models/cv/animal_recognition/resnet.py
 create mode 100644 modelscope/models/cv/animal_recognition/splat.py
 create mode 100644 modelscope/pipelines/cv/animal_recog_pipeline.py
 create mode 100644 tests/pipelines/test_animal_recognation.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 680fe2e8..9fad45e2 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -43,6 +43,7 @@ class Pipelines(object):
     person_image_cartoon = 'unet-person-image-cartoon'
     ocr_detection = 'resnet18-ocr-detection'
     action_recognition =
'TAdaConv_action-recognition' + animal_recognation = 'resnet101-animal_recog' # nlp tasks sentence_similarity = 'sentence-similarity' diff --git a/modelscope/models/cv/animal_recognition/__init__.py b/modelscope/models/cv/animal_recognition/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py new file mode 100644 index 00000000..1fd4b93e --- /dev/null +++ b/modelscope/models/cv/animal_recognition/resnet.py @@ -0,0 +1,430 @@ +import math + +import torch +import torch.nn as nn + +from .splat import SplAtConv2d + +__all__ = ['ResNet', 'Bottleneck'] + + +class DropBlock2D(object): + + def __init__(self, *args, **kwargs): + raise NotImplementedError + + +class GlobalAvgPool2d(nn.Module): + + def __init__(self): + """Global average pooling over the input's spatial dimensions""" + super(GlobalAvgPool2d, self).__init__() + + def forward(self, inputs): + return nn.functional.adaptive_avg_pool2d(inputs, + 1).view(inputs.size(0), -1) + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + radix=1, + cardinality=1, + bottleneck_width=64, + avd=False, + avd_first=False, + dilation=1, + is_first=False, + rectified_conv=False, + rectify_avg=False, + norm_layer=None, + dropblock_prob=0.0, + last_gamma=False): + super(Bottleneck, self).__init__() + group_width = int(planes * (bottleneck_width / 64.)) * cardinality + self.conv1 = nn.Conv2d( + inplanes, group_width, kernel_size=1, bias=False) + self.bn1 = norm_layer(group_width) + self.dropblock_prob = dropblock_prob + self.radix = radix + self.avd = avd and (stride > 1 or is_first) + self.avd_first = avd_first + + if self.avd: + self.avd_layer = nn.AvgPool2d(3, stride, padding=1) + stride = 1 + + if dropblock_prob > 0.0: + self.dropblock1 = DropBlock2D(dropblock_prob, 3) + if radix == 1: + self.dropblock2 = DropBlock2D(dropblock_prob, 3) + self.dropblock3 = DropBlock2D(dropblock_prob, 3) + + if radix >= 1: + self.conv2 = SplAtConv2d( + group_width, + group_width, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False, + radix=radix, + rectify=rectified_conv, + rectify_avg=rectify_avg, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + elif rectified_conv: + from rfconv import RFConv2d + self.conv2 = RFConv2d( + group_width, + group_width, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False, + average_mode=rectify_avg) + self.bn2 = norm_layer(group_width) + else: + self.conv2 = nn.Conv2d( + group_width, + group_width, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False) + self.bn2 = norm_layer(group_width) + + self.conv3 = nn.Conv2d( + group_width, planes * 4, kernel_size=1, bias=False) + self.bn3 = norm_layer(planes * 4) + + if last_gamma: + from torch.nn.init import zeros_ + zeros_(self.bn3.weight) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + if self.dropblock_prob > 0.0: + out = self.dropblock1(out) + out = self.relu(out) + + if self.avd and self.avd_first: + out = self.avd_layer(out) + + out = self.conv2(out) + if self.radix == 0: + out = self.bn2(out) + if self.dropblock_prob > 0.0: + out = self.dropblock2(out) + out = 
self.relu(out) + + if self.avd and not self.avd_first: + out = self.avd_layer(out) + + out = self.conv3(out) + out = self.bn3(out) + if self.dropblock_prob > 0.0: + out = self.dropblock3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, + block, + layers, + radix=1, + groups=1, + bottleneck_width=64, + num_classes=1000, + dilated=False, + dilation=1, + deep_stem=False, + stem_width=64, + avg_down=False, + rectified_conv=False, + rectify_avg=False, + avd=False, + avd_first=False, + final_drop=0.0, + dropblock_prob=0, + last_gamma=False, + norm_layer=nn.BatchNorm2d): + self.cardinality = groups + self.bottleneck_width = bottleneck_width + # ResNet-D params + self.inplanes = stem_width * 2 if deep_stem else 64 + self.avg_down = avg_down + self.last_gamma = last_gamma + # ResNeSt params + self.radix = radix + self.avd = avd + self.avd_first = avd_first + + super(ResNet, self).__init__() + self.rectified_conv = rectified_conv + self.rectify_avg = rectify_avg + if rectified_conv: + from rfconv import RFConv2d + conv_layer = RFConv2d + else: + conv_layer = nn.Conv2d + conv_kwargs = {'average_mode': rectify_avg} if rectified_conv else {} + if deep_stem: + self.conv1 = nn.Sequential( + conv_layer( + 3, + stem_width, + kernel_size=3, + stride=2, + padding=1, + bias=False, + **conv_kwargs), + norm_layer(stem_width), + nn.ReLU(inplace=True), + conv_layer( + stem_width, + stem_width, + kernel_size=3, + stride=1, + padding=1, + bias=False, + **conv_kwargs), + norm_layer(stem_width), + nn.ReLU(inplace=True), + conv_layer( + stem_width, + stem_width * 2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + **conv_kwargs), + ) + else: + self.conv1 = conv_layer( + 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False, + **conv_kwargs) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer( + block, 64, layers[0], norm_layer=norm_layer, is_first=False) + self.layer2 = self._make_layer( + block, 128, layers[1], stride=2, norm_layer=norm_layer) + if dilated or dilation == 4: + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=1, + dilation=2, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=1, + dilation=4, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + elif dilation == 2: + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + dilation=1, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=1, + dilation=2, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + else: + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=2, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob) + self.avgpool = GlobalAvgPool2d() + self.drop = nn.Dropout(final_drop) if final_drop > 0.0 else None + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, norm_layer): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, + block, + planes, + blocks, + stride=1, + dilation=1, + norm_layer=None, + dropblock_prob=0.0, + is_first=True): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + down_layers = [] + if self.avg_down: + if dilation == 1: + down_layers.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + else: + down_layers.append( + nn.AvgPool2d( + kernel_size=1, + stride=1, + ceil_mode=True, + count_include_pad=False)) + down_layers.append( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=1, + bias=False)) + else: + down_layers.append( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False)) + down_layers.append(norm_layer(planes * block.expansion)) + downsample = nn.Sequential(*down_layers) + + layers = [] + if dilation == 1 or dilation == 2: + layers.append( + block( + self.inplanes, + planes, + stride, + downsample=downsample, + radix=self.radix, + cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, + avd_first=self.avd_first, + dilation=1, + is_first=is_first, + rectified_conv=self.rectified_conv, + rectify_avg=self.rectify_avg, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) + elif dilation == 4: + layers.append( + block( + self.inplanes, + planes, + stride, + downsample=downsample, + radix=self.radix, + cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, + avd_first=self.avd_first, + dilation=2, + is_first=is_first, + rectified_conv=self.rectified_conv, + rectify_avg=self.rectify_avg, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) + else: + raise RuntimeError('=> unknown dilation size: {}'.format(dilation)) + + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + radix=self.radix, + cardinality=self.cardinality, + bottleneck_width=self.bottleneck_width, + avd=self.avd, + avd_first=self.avd_first, + dilation=dilation, + rectified_conv=self.rectified_conv, + rectify_avg=self.rectify_avg, + norm_layer=norm_layer, + dropblock_prob=dropblock_prob, + last_gamma=self.last_gamma)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + if self.drop: + x = self.drop(x) + x = self.fc(x) + + return x diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py new file mode 100644 index 00000000..b12bf154 --- /dev/null +++ b/modelscope/models/cv/animal_recognition/splat.py @@ -0,0 +1,125 @@ +"""Split-Attention""" + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import BatchNorm2d, Conv2d, Linear, Module, ReLU +from torch.nn.modules.utils import _pair + +__all__ = ['SplAtConv2d'] + + +class SplAtConv2d(Module): + """Split-Attention Conv2d + """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=(1, 1), + padding=(0, 0), + dilation=(1, 1), + groups=1, + bias=True, + radix=2, + reduction_factor=4, + rectify=False, + rectify_avg=False, + norm_layer=None, + dropblock_prob=0.0, + 
**kwargs): + super(SplAtConv2d, self).__init__() + padding = _pair(padding) + self.rectify = rectify and (padding[0] > 0 or padding[1] > 0) + self.rectify_avg = rectify_avg + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.cardinality = groups + self.channels = channels + self.dropblock_prob = dropblock_prob + if self.rectify: + from rfconv import RFConv2d + self.conv = RFConv2d( + in_channels, + channels * radix, + kernel_size, + stride, + padding, + dilation, + groups=groups * radix, + bias=bias, + average_mode=rectify_avg, + **kwargs) + else: + self.conv = Conv2d( + in_channels, + channels * radix, + kernel_size, + stride, + padding, + dilation, + groups=groups * radix, + bias=bias, + **kwargs) + self.use_bn = norm_layer is not None + if self.use_bn: + self.bn0 = norm_layer(channels * radix) + self.relu = ReLU(inplace=True) + self.fc1 = Conv2d(channels, inter_channels, 1, groups=self.cardinality) + if self.use_bn: + self.bn1 = norm_layer(inter_channels) + self.fc2 = Conv2d( + inter_channels, channels * radix, 1, groups=self.cardinality) + if dropblock_prob > 0.0: + self.dropblock = DropBlock2D(dropblock_prob, 3) + self.rsoftmax = rSoftMax(radix, groups) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn0(x) + if self.dropblock_prob > 0.0: + x = self.dropblock(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + if self.radix > 1: + splited = torch.split(x, rchannel // self.radix, dim=1) + gap = sum(splited) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + if self.use_bn: + gap = self.bn1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = torch.split(atten, rchannel // self.radix, dim=1) + out = sum([att * split for (att, split) in zip(attens, splited)]) + else: + out = atten * x + return out.contiguous() + + +class rSoftMax(nn.Module): + + def __init__(self, radix, cardinality): + super().__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 68d875ec..b046e076 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -1,4 +1,5 @@ from .action_recognition_pipeline import ActionRecognitionPipeline +from .animal_recog_pipeline import AnimalRecogPipeline from .image_cartoon_pipeline import ImageCartoonPipeline from .image_matting_pipeline import ImageMattingPipeline from .ocr_detection_pipeline import OCRDetectionPipeline diff --git a/modelscope/pipelines/cv/animal_recog_pipeline.py b/modelscope/pipelines/cv/animal_recog_pipeline.py new file mode 100644 index 00000000..eee9e844 --- /dev/null +++ b/modelscope/pipelines/cv/animal_recog_pipeline.py @@ -0,0 +1,127 @@ +import os.path as osp +import tempfile +from typing import Any, Dict + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.fileio import File +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Pipelines +from modelscope.models.cv.animal_recognition import resnet +from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image 
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from ..base import Pipeline
+from ..builder import PIPELINES
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.image_classification, module_name=Pipelines.animal_recognation)
+class AnimalRecogPipeline(Pipeline):
+
+    def __init__(self, model: str):
+        super().__init__(model=model)
+        import torch
+
+        def resnest101(**kwargs):
+            model = resnet.ResNet(
+                resnet.Bottleneck, [3, 4, 23, 3],
+                radix=2,
+                groups=1,
+                bottleneck_width=64,
+                deep_stem=True,
+                stem_width=64,
+                avg_down=True,
+                avd=True,
+                avd_first=False,
+                **kwargs)
+            return model
+
+        def filter_param(src_params, own_state):
+            copied_keys = []
+            for name, param in src_params.items():
+                if 'module.' == name[0:7]:
+                    name = name[7:]
+                if '.module.' not in list(own_state.keys())[0]:
+                    name = name.replace('.module.', '.')
+                if (name in own_state) and (own_state[name].shape
+                                            == param.shape):
+                    own_state[name].copy_(param)
+                    copied_keys.append(name)
+
+        def load_pretrained(model, src_params):
+            if 'state_dict' in src_params:
+                src_params = src_params['state_dict']
+            own_state = model.state_dict()
+            filter_param(src_params, own_state)
+            model.load_state_dict(own_state)
+
+        self.model = resnest101(num_classes=8288)
+        local_model_dir = model
+        if osp.exists(model):
+            local_model_dir = model
+        else:
+            local_model_dir = snapshot_download(model)
+        self.local_path = local_model_dir
+        src_params = torch.load(
+            osp.join(local_model_dir, 'pytorch_model.pt'), 'cpu')
+        load_pretrained(self.model, src_params)
+        logger.info('load model done')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        if isinstance(input, str):
+            img = load_image(input)
+        elif isinstance(input, Image.Image):
+            img = input.convert('RGB')
+        elif isinstance(input, np.ndarray):
+            if len(input.shape) == 2:
+                input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
+            img = input[:, :, ::-1]
+            img = Image.fromarray(img.astype('uint8')).convert('RGB')
+        else:
+            raise TypeError(f'input should be either str, PIL.Image,'
+                            f' np.array, but got {type(input)}')
+
+        normalize = transforms.Normalize(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        test_transforms = transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(), normalize
+        ])
+        img = test_transforms(img)
+        result = {'img': img}
+        return result
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+
+        def set_phase(model, is_train):
+            if is_train:
+                model.train()
+            else:
+                model.eval()
+
+        is_train = False
+        set_phase(self.model, is_train)
+        img = input['img']
+        input_img = torch.unsqueeze(img, 0)
+        outputs = self.model(input_img)
+        return {'outputs': outputs}
+
+    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        label_mapping_path = osp.join(self.local_path, 'label_mapping.txt')
+        with open(label_mapping_path, 'r') as f:
+            label_mapping = f.readlines()
+        score = torch.max(inputs['outputs'])
+        inputs = {
+            'scores': score.item(),
+            'labels': label_mapping[inputs['outputs'].argmax()].split('\t')[1]
+        }
+        return inputs
diff --git a/tests/pipelines/test_animal_recognation.py b/tests/pipelines/test_animal_recognation.py
new file mode 100644
index 00000000..d0f42dc3
--- /dev/null
+++ b/tests/pipelines/test_animal_recognation.py
@@ -0,0 +1,20 @@
+import unittest
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class AnimalRecognitionTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run(self):
+        animal_recog = pipeline(
+            Tasks.image_classification,
+            model='damo/cv_resnest101_animal_recognation')
+        result = animal_recog('data/test/images/image1.jpg')
+        print(result)
+
+
+if __name__ == '__main__':
+    unittest.main()

From a7c1cd0fc92ee0a3058cec8f4ccde1c0f641e982 Mon Sep 17 00:00:00 2001
From: "suluyan.sly"
Date: Tue, 28 Jun 2022 14:34:16 +0800
Subject: [PATCH 3/9] [to #42322933]feat: add nlp-chinese-bert-fill-mask-pipeline to maas_lib

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9155437
---
 .../models/nlp/masked_language_model.py       | 48 ++++++++++++-------
 .../pipelines/nlp/fill_mask_pipeline.py       | 36 ++++++++------
 modelscope/preprocessors/nlp.py               | 11 +++--
 tests/pipelines/test_fill_mask.py             | 36 +++++++++++++-
 4 files changed, 94 insertions(+), 37 deletions(-)

diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py
index fd5f97e6..a760822b 100644
--- a/modelscope/models/nlp/masked_language_model.py
+++ b/modelscope/models/nlp/masked_language_model.py
@@ -2,24 +2,28 @@ from typing import Any, Dict, Optional, Union
 
 import numpy as np
 
-from modelscope.metainfo import Models
-from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ...utils.constant import Tasks
 from ..base import Model, Tensor
 from ..builder import MODELS
 
-__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM']
+__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM']
 
 
-class AliceMindBaseForMaskedLM(Model):
+class MaskedLanguageModelBase(Model):
 
     def __init__(self, model_dir: str, *args, **kwargs):
-        from sofa.utils.backend import AutoConfig, AutoModelForMaskedLM
-
         self.model_dir = model_dir
         super().__init__(model_dir, *args, **kwargs)
+        self.model = self.build_model()
 
-        self.config = AutoConfig.from_pretrained(model_dir)
-        self.model = AutoModelForMaskedLM.from_pretrained(
-            model_dir, config=self.config)
+    def build_model(self):
+        raise NotImplementedError()
+
+    @property
+    def config(self):
+        if hasattr(self.model, 'config'):
+            return self.model.config
+        return None
 
     def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
         """return the result by the model
@@ -38,14 +42,24 @@
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
-class StructBertForMaskedLM(AliceMindBaseForMaskedLM):
-    # The StructBert for MaskedLM uses the same underlying model structure
-    # as the base model class.
-    pass
+class StructBertForMaskedLM(MaskedLanguageModelBase):
+
+    def build_model(self):
+        from sofa import SbertForMaskedLM
+        return SbertForMaskedLM.from_pretrained(self.model_dir)
 
 
 @MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
-class VecoForMaskedLM(AliceMindBaseForMaskedLM):
-    # The Veco for MaskedLM uses the same underlying model structure
-    # as the base model class.
- pass +class VecoForMaskedLM(MaskedLanguageModelBase): + + def build_model(self): + from sofa import VecoForMaskedLM + return VecoForMaskedLM.from_pretrained(self.model_dir) + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert) +class BertForMaskedLM(MaskedLanguageModelBase): + + def build_model(self): + from transformers import BertForMaskedLM + return BertForMaskedLM.from_pretrained(self.model_dir) diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py index 863d9a6d..1567ef9d 100644 --- a/modelscope/pipelines/nlp/fill_mask_pipeline.py +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -1,32 +1,34 @@ +import os from typing import Dict, Optional, Union -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.models.nlp.masked_language_model import \ - AliceMindBaseForMaskedLM -from modelscope.preprocessors import FillMaskPreprocessor -from modelscope.utils.constant import Tasks +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp.masked_language_model import MaskedLanguageModelBase +from ...preprocessors import FillMaskPreprocessor +from ...utils.config import Config +from ...utils.constant import ModelFile, Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES __all__ = ['FillMaskPipeline'] +_type_map = {'veco': 'roberta', 'sbert': 'bert'} @PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) class FillMaskPipeline(Pipeline): def __init__(self, - model: Union[AliceMindBaseForMaskedLM, str], + model: Union[MaskedLanguageModelBase, str], preprocessor: Optional[FillMaskPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction Args: - model (AliceMindBaseForMaskedLM): a model instance + model (MaskedLanguageModelBase): a model instance preprocessor (FillMaskPreprocessor): a preprocessor instance """ fill_mask_model = model if isinstance( - model, AliceMindBaseForMaskedLM) else Model.from_pretrained(model) + model, MaskedLanguageModelBase) else Model.from_pretrained(model) if preprocessor is None: preprocessor = FillMaskPreprocessor( fill_mask_model.model_dir, @@ -34,11 +36,13 @@ class FillMaskPipeline(Pipeline): second_sequence=None) super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.preprocessor = preprocessor + self.config = Config.from_file( + os.path.join(fill_mask_model.model_dir, ModelFile.CONFIGURATION)) self.tokenizer = preprocessor.tokenizer - self.mask_id = {'veco': 250001, 'sbert': 103} + self.mask_id = {'roberta': 250001, 'bert': 103} self.rep_map = { - 'sbert': { + 'bert': { '[unused0]': '', '[PAD]': '', '[unused1]': '', @@ -48,7 +52,7 @@ class FillMaskPipeline(Pipeline): '[CLS]': '', '[UNK]': '' }, - 'veco': { + 'roberta': { r' +': ' ', '': '', '': '', @@ -72,7 +76,9 @@ class FillMaskPipeline(Pipeline): input_ids = inputs['input_ids'].detach().numpy() pred_ids = np.argmax(logits, axis=-1) model_type = self.model.config.model_type - rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids, + process_type = model_type if model_type in self.mask_id else _type_map[ + model_type] + rst_ids = np.where(input_ids == self.mask_id[process_type], pred_ids, input_ids) def rep_tokens(string, rep_map): @@ -82,12 +88,12 @@ class FillMaskPipeline(Pipeline): pred_strings = [] for ids in rst_ids: # batch - if self.model.config.vocab_size == 21128: # zh bert + if 'language' in self.config.model and 
self.config.model.language == 'zh': pred_string = self.tokenizer.convert_ids_to_tokens(ids) pred_string = ''.join(pred_string) else: pred_string = self.tokenizer.decode(ids) - pred_string = rep_tokens(pred_string, self.rep_map[model_type]) + pred_string = rep_tokens(pred_string, self.rep_map[process_type]) pred_strings.append(pred_string) return {'text': pred_strings} diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 3f98a081..4ed63f3c 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -192,14 +192,17 @@ class FillMaskPreprocessor(Preprocessor): model_dir (str): model path """ super().__init__(*args, **kwargs) - from sofa.utils.backend import AutoTokenizer self.model_dir = model_dir self.first_sequence: str = kwargs.pop('first_sequence', 'first_sequence') self.sequence_length = kwargs.pop('sequence_length', 128) - - self.tokenizer = AutoTokenizer.from_pretrained( - model_dir, use_fast=False) + try: + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained(model_dir) + except KeyError: + from sofa.utils.backend import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=False) @type_assert(object, str) def __call__(self, data: str) -> Dict[str, Any]: diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index 49c5dc8a..d44ba4c8 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -3,7 +3,8 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM +from modelscope.models.nlp import (BertForMaskedLM, StructBertForMaskedLM, + VecoForMaskedLM) from modelscope.pipelines import FillMaskPipeline, pipeline from modelscope.preprocessors import FillMaskPreprocessor from modelscope.utils.constant import Tasks @@ -16,6 +17,7 @@ class FillMaskTest(unittest.TestCase): 'en': 'damo/nlp_structbert_fill-mask_english-large' } model_id_veco = 'damo/nlp_veco_fill-mask-large' + model_id_bert = 'damo/nlp_bert_fill-mask_chinese-base' ori_texts = { 'zh': @@ -69,6 +71,20 @@ class FillMaskTest(unittest.TestCase): f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' ) + # zh bert + language = 'zh' + model_dir = snapshot_download(self.model_id_bert) + preprocessor = FillMaskPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = BertForMaskedLM(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): # sbert @@ -97,6 +113,18 @@ class FillMaskTest(unittest.TestCase): print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' f'{pipeline_ins(test_input)}\n') + # zh bert + model = Model.from_pretrained(self.model_id_bert) + preprocessor = FillMaskPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + language = 'zh' + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + 
print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_name(self): # veco @@ -115,6 +143,12 @@ class FillMaskTest(unittest.TestCase): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') + # bert + pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_bert) + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) From 04b7eba285dae7026a08b0136ccea7ba31319f6b Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 28 Jun 2022 14:41:08 +0800 Subject: [PATCH 4/9] [to #42322933] Merge ANS pipeline into master Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9178339 * refactor: move aec models to audio/aec * refactor: move aec models to audio/aec * refactor: move aec models to audio/aec * refactor: move aec models to audio/aec * feat: add unittest for ANS pipeline * Merge branch 'master' into dev/ans * add new SoundFile to audio dependency * Merge branch 'master' into dev/ans * use ANS pipeline name from metainfo * Merge branch 'master' into dev/ans * chore: update docstring of ANS module * Merge branch 'master' into dev/ans * refactor: use names from metainfo * refactor: enable ans unittest * refactor: add more log message in unittest --- modelscope/metainfo.py | 2 + modelscope/models/__init__.py | 1 + .../models/audio/{layers => aec}/__init__.py | 0 .../audio/{network => aec/layers}/__init__.py | 0 .../audio/{ => aec}/layers/activations.py | 0 .../{ => aec}/layers/affine_transform.py | 0 .../audio/{ => aec}/layers/deep_fsmn.py | 0 .../audio/{ => aec}/layers/layer_base.py | 0 .../audio/{ => aec}/layers/uni_deep_fsmn.py | 0 .../models/audio/aec/network/__init__.py | 0 .../models/audio/{ => aec}/network/loss.py | 0 .../{ => aec}/network/modulation_loss.py | 0 .../models/audio/{ => aec}/network/se_net.py | 0 modelscope/models/audio/ans/__init__.py | 0 modelscope/models/audio/ans/complex_nn.py | 248 ++++++++++++++ modelscope/models/audio/ans/conv_stft.py | 112 +++++++ modelscope/models/audio/ans/frcrn.py | 309 ++++++++++++++++++ .../models/audio/ans/se_module_complex.py | 26 ++ modelscope/models/audio/ans/unet.py | 269 +++++++++++++++ modelscope/pipelines/__init__.py | 1 + modelscope/pipelines/audio/ans_pipeline.py | 117 +++++++ requirements/audio.txt | 1 + tests/pipelines/test_speech_signal_process.py | 32 +- 23 files changed, 1112 insertions(+), 6 deletions(-) rename modelscope/models/audio/{layers => aec}/__init__.py (100%) rename modelscope/models/audio/{network => aec/layers}/__init__.py (100%) rename modelscope/models/audio/{ => aec}/layers/activations.py (100%) rename modelscope/models/audio/{ => aec}/layers/affine_transform.py (100%) rename modelscope/models/audio/{ => aec}/layers/deep_fsmn.py (100%) rename modelscope/models/audio/{ => aec}/layers/layer_base.py (100%) rename modelscope/models/audio/{ => aec}/layers/uni_deep_fsmn.py (100%) create mode 100644 modelscope/models/audio/aec/network/__init__.py rename modelscope/models/audio/{ => aec}/network/loss.py (100%) rename modelscope/models/audio/{ => aec}/network/modulation_loss.py (100%) rename modelscope/models/audio/{ => 
aec}/network/se_net.py (100%) create mode 100644 modelscope/models/audio/ans/__init__.py create mode 100644 modelscope/models/audio/ans/complex_nn.py create mode 100644 modelscope/models/audio/ans/conv_stft.py create mode 100644 modelscope/models/audio/ans/frcrn.py create mode 100644 modelscope/models/audio/ans/se_module_complex.py create mode 100644 modelscope/models/audio/ans/unet.py create mode 100644 modelscope/pipelines/audio/ans_pipeline.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 9fad45e2..eda590ac 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -21,6 +21,7 @@ class Models(object): sambert_hifi_16k = 'sambert-hifi-16k' generic_tts_frontend = 'generic-tts-frontend' hifigan16k = 'hifigan16k' + speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' kws_kwsbp = 'kws-kwsbp' # multi-modal models @@ -55,6 +56,7 @@ class Pipelines(object): # audio tasks sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts' speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' + speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' kws_kwsbp = 'kws-kwsbp' # multi-modal tasks diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index ebf81c32..816c44e2 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from .audio.ans.frcrn import FRCRNModel from .audio.kws import GenericKeyWordSpotting from .audio.tts.am import SambertNetHifi16k from .audio.tts.vocoder import Hifigan16k diff --git a/modelscope/models/audio/layers/__init__.py b/modelscope/models/audio/aec/__init__.py similarity index 100% rename from modelscope/models/audio/layers/__init__.py rename to modelscope/models/audio/aec/__init__.py diff --git a/modelscope/models/audio/network/__init__.py b/modelscope/models/audio/aec/layers/__init__.py similarity index 100% rename from modelscope/models/audio/network/__init__.py rename to modelscope/models/audio/aec/layers/__init__.py diff --git a/modelscope/models/audio/layers/activations.py b/modelscope/models/audio/aec/layers/activations.py similarity index 100% rename from modelscope/models/audio/layers/activations.py rename to modelscope/models/audio/aec/layers/activations.py diff --git a/modelscope/models/audio/layers/affine_transform.py b/modelscope/models/audio/aec/layers/affine_transform.py similarity index 100% rename from modelscope/models/audio/layers/affine_transform.py rename to modelscope/models/audio/aec/layers/affine_transform.py diff --git a/modelscope/models/audio/layers/deep_fsmn.py b/modelscope/models/audio/aec/layers/deep_fsmn.py similarity index 100% rename from modelscope/models/audio/layers/deep_fsmn.py rename to modelscope/models/audio/aec/layers/deep_fsmn.py diff --git a/modelscope/models/audio/layers/layer_base.py b/modelscope/models/audio/aec/layers/layer_base.py similarity index 100% rename from modelscope/models/audio/layers/layer_base.py rename to modelscope/models/audio/aec/layers/layer_base.py diff --git a/modelscope/models/audio/layers/uni_deep_fsmn.py b/modelscope/models/audio/aec/layers/uni_deep_fsmn.py similarity index 100% rename from modelscope/models/audio/layers/uni_deep_fsmn.py rename to modelscope/models/audio/aec/layers/uni_deep_fsmn.py diff --git a/modelscope/models/audio/aec/network/__init__.py b/modelscope/models/audio/aec/network/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/network/loss.py b/modelscope/models/audio/aec/network/loss.py similarity index 100% 
rename from modelscope/models/audio/network/loss.py rename to modelscope/models/audio/aec/network/loss.py diff --git a/modelscope/models/audio/network/modulation_loss.py b/modelscope/models/audio/aec/network/modulation_loss.py similarity index 100% rename from modelscope/models/audio/network/modulation_loss.py rename to modelscope/models/audio/aec/network/modulation_loss.py diff --git a/modelscope/models/audio/network/se_net.py b/modelscope/models/audio/aec/network/se_net.py similarity index 100% rename from modelscope/models/audio/network/se_net.py rename to modelscope/models/audio/aec/network/se_net.py diff --git a/modelscope/models/audio/ans/__init__.py b/modelscope/models/audio/ans/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/ans/complex_nn.py b/modelscope/models/audio/ans/complex_nn.py new file mode 100644 index 00000000..69dec41e --- /dev/null +++ b/modelscope/models/audio/ans/complex_nn.py @@ -0,0 +1,248 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class UniDeepFsmn(nn.Module): + + def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None): + super(UniDeepFsmn, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + + if lorder is None: + return + + self.lorder = lorder + self.hidden_size = hidden_size + + self.linear = nn.Linear(input_dim, hidden_size) + + self.project = nn.Linear(hidden_size, output_dim, bias=False) + + self.conv1 = nn.Conv2d( + output_dim, + output_dim, [lorder, 1], [1, 1], + groups=output_dim, + bias=False) + + def forward(self, input): + r""" + + Args: + input: torch with shape: batch (b) x sequence(T) x feature (h) + + Returns: + batch (b) x channel (c) x sequence(T) x feature (h) + """ + f1 = F.relu(self.linear(input)) + + p1 = self.project(f1) + + x = torch.unsqueeze(p1, 1) + # x: batch (b) x channel (c) x sequence(T) x feature (h) + x_per = x.permute(0, 3, 2, 1) + # x_per: batch (b) x feature (h) x sequence(T) x channel (c) + y = F.pad(x_per, [0, 0, self.lorder - 1, 0]) + + out = x_per + self.conv1(y) + + out1 = out.permute(0, 3, 2, 1) + # out1: batch (b) x channel (c) x sequence(T) x feature (h) + return input + out1.squeeze() + + +class ComplexUniDeepFsmn(nn.Module): + + def __init__(self, nIn, nHidden=128, nOut=128): + super(ComplexUniDeepFsmn, self).__init__() + + self.fsmn_re_L1 = UniDeepFsmn(nIn, nHidden, 20, nHidden) + self.fsmn_im_L1 = UniDeepFsmn(nIn, nHidden, 20, nHidden) + self.fsmn_re_L2 = UniDeepFsmn(nHidden, nOut, 20, nHidden) + self.fsmn_im_L2 = UniDeepFsmn(nHidden, nOut, 20, nHidden) + + def forward(self, x): + r""" + + Args: + x: torch with shape [batch, channel, feature, sequence, 2], eg: [6, 256, 1, 106, 2] + + Returns: + [batch, feature, sequence, 2], eg: [6, 99, 1024, 2] + """ + # + b, c, h, T, d = x.size() + x = torch.reshape(x, (b, c * h, T, d)) + # x: [b,h,T,2], [6, 256, 106, 2] + x = torch.transpose(x, 1, 2) + # x: [b,T,h,2], [6, 106, 256, 2] + + real_L1 = self.fsmn_re_L1(x[..., 0]) - self.fsmn_im_L1(x[..., 1]) + imaginary_L1 = self.fsmn_re_L1(x[..., 1]) + self.fsmn_im_L1(x[..., 0]) + # GRU output: [99, 6, 128] + real = self.fsmn_re_L2(real_L1) - self.fsmn_im_L2(imaginary_L1) + imaginary = self.fsmn_re_L2(imaginary_L1) + self.fsmn_im_L2(real_L1) + # output: [b,T,h,2], [99, 6, 1024, 2] + output = torch.stack((real, imaginary), dim=-1) + + # output: [b,h,T,2], [6, 99, 1024, 2] + output = torch.transpose(output, 1, 2) + output = torch.reshape(output, (b, c, h, T, d)) + + return output + + +class 
ComplexUniDeepFsmn_L1(nn.Module): + + def __init__(self, nIn, nHidden=128, nOut=128): + super(ComplexUniDeepFsmn_L1, self).__init__() + self.fsmn_re_L1 = UniDeepFsmn(nIn, nHidden, 20, nHidden) + self.fsmn_im_L1 = UniDeepFsmn(nIn, nHidden, 20, nHidden) + + def forward(self, x): + r""" + + Args: + x: torch with shape [batch, channel, feature, sequence, 2], eg: [6, 256, 1, 106, 2] + """ + b, c, h, T, d = x.size() + # x : [b,T,h,c,2] + x = torch.transpose(x, 1, 3) + x = torch.reshape(x, (b * T, h, c, d)) + + real = self.fsmn_re_L1(x[..., 0]) - self.fsmn_im_L1(x[..., 1]) + imaginary = self.fsmn_re_L1(x[..., 1]) + self.fsmn_im_L1(x[..., 0]) + # output: [b*T,h,c,2], [6*106, h, 256, 2] + output = torch.stack((real, imaginary), dim=-1) + + output = torch.reshape(output, (b, T, h, c, d)) + output = torch.transpose(output, 1, 3) + return output + + +class ComplexConv2d(nn.Module): + # https://github.com/litcoderr/ComplexCNN/blob/master/complexcnn/modules.py + def __init__(self, + in_channel, + out_channel, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + **kwargs): + super().__init__() + + # Model components + self.conv_re = nn.Conv2d( + in_channel, + out_channel, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + **kwargs) + self.conv_im = nn.Conv2d( + in_channel, + out_channel, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + **kwargs) + + def forward(self, x): + r""" + + Args: + x: torch with shape: [batch,channel,axis1,axis2,2] + """ + real = self.conv_re(x[..., 0]) - self.conv_im(x[..., 1]) + imaginary = self.conv_re(x[..., 1]) + self.conv_im(x[..., 0]) + output = torch.stack((real, imaginary), dim=-1) + return output + + +class ComplexConvTranspose2d(nn.Module): + + def __init__(self, + in_channel, + out_channel, + kernel_size, + stride=1, + padding=0, + output_padding=0, + dilation=1, + groups=1, + bias=True, + **kwargs): + super().__init__() + + # Model components + self.tconv_re = nn.ConvTranspose2d( + in_channel, + out_channel, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + dilation=dilation, + **kwargs) + self.tconv_im = nn.ConvTranspose2d( + in_channel, + out_channel, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + dilation=dilation, + **kwargs) + + def forward(self, x): # shpae of x : [batch,channel,axis1,axis2,2] + real = self.tconv_re(x[..., 0]) - self.tconv_im(x[..., 1]) + imaginary = self.tconv_re(x[..., 1]) + self.tconv_im(x[..., 0]) + output = torch.stack((real, imaginary), dim=-1) + return output + + +class ComplexBatchNorm2d(nn.Module): + + def __init__(self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + **kwargs): + super().__init__() + self.bn_re = nn.BatchNorm2d( + num_features=num_features, + momentum=momentum, + affine=affine, + eps=eps, + track_running_stats=track_running_stats, + **kwargs) + self.bn_im = nn.BatchNorm2d( + num_features=num_features, + momentum=momentum, + affine=affine, + eps=eps, + track_running_stats=track_running_stats, + **kwargs) + + def forward(self, x): + real = self.bn_re(x[..., 0]) + imag = self.bn_im(x[..., 1]) + output = torch.stack((real, imag), dim=-1) + return output diff --git a/modelscope/models/audio/ans/conv_stft.py b/modelscope/models/audio/ans/conv_stft.py new file mode 100644 index 
00000000..a47d7817
--- /dev/null
+++ b/modelscope/models/audio/ans/conv_stft.py
@@ -0,0 +1,112 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from scipy.signal import get_window
+
+
+def init_kernels(win_len, win_inc, fft_len, win_type=None, invers=False):
+    if win_type == 'None' or win_type is None:
+        window = np.ones(win_len)
+    else:
+        window = get_window(win_type, win_len, fftbins=True)**0.5
+
+    N = fft_len
+    fourier_basis = np.fft.rfft(np.eye(N))[:win_len]
+    real_kernel = np.real(fourier_basis)
+    imag_kernel = np.imag(fourier_basis)
+    kernel = np.concatenate([real_kernel, imag_kernel], 1).T
+
+    if invers:
+        kernel = np.linalg.pinv(kernel).T
+
+    kernel = kernel * window
+    kernel = kernel[:, None, :]
+    return torch.from_numpy(kernel.astype(np.float32)), torch.from_numpy(
+        window[None, :, None].astype(np.float32))
+
+
+class ConvSTFT(nn.Module):
+
+    def __init__(self,
+                 win_len,
+                 win_inc,
+                 fft_len=None,
+                 win_type='hamming',
+                 feature_type='real',
+                 fix=True):
+        super(ConvSTFT, self).__init__()
+
+        if fft_len is None:
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
+        else:
+            self.fft_len = fft_len
+
+        kernel, _ = init_kernels(win_len, win_inc, self.fft_len, win_type)
+        self.weight = nn.Parameter(kernel, requires_grad=(not fix))
+        self.feature_type = feature_type
+        self.stride = win_inc
+        self.win_len = win_len
+        self.dim = self.fft_len
+
+    def forward(self, inputs):
+        if inputs.dim() == 2:
+            inputs = torch.unsqueeze(inputs, 1)
+
+        outputs = F.conv1d(inputs, self.weight, stride=self.stride)
+
+        if self.feature_type == 'complex':
+            return outputs
+        else:
+            dim = self.dim // 2 + 1
+            real = outputs[:, :dim, :]
+            imag = outputs[:, dim:, :]
+            mags = torch.sqrt(real**2 + imag**2)
+            phase = torch.atan2(imag, real)
+            return mags, phase
+
+
+class ConviSTFT(nn.Module):
+
+    def __init__(self,
+                 win_len,
+                 win_inc,
+                 fft_len=None,
+                 win_type='hamming',
+                 feature_type='real',
+                 fix=True):
+        super(ConviSTFT, self).__init__()
+        if fft_len is None:
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
+        else:
+            self.fft_len = fft_len
+        kernel, window = init_kernels(
+            win_len, win_inc, self.fft_len, win_type, invers=True)
+        self.weight = nn.Parameter(kernel, requires_grad=(not fix))
+        self.feature_type = feature_type
+        self.win_type = win_type
+        self.win_len = win_len
+        self.win_inc = win_inc
+        self.stride = win_inc
+        self.dim = self.fft_len
+        self.register_buffer('window', window)
+        self.register_buffer('enframe', torch.eye(win_len)[:, None, :])
+
+    def forward(self, inputs, phase=None):
+        """
+        Args:
+            inputs : [B, N+2, T] (complex spec) or [B, N//2+1, T] (mags)
+            phase: [B, N//2+1, T] (if not none)
+        """
+
+        if phase is not None:
+            real = inputs * torch.cos(phase)
+            imag = inputs * torch.sin(phase)
+            inputs = torch.cat([real, imag], 1)
+        outputs = F.conv_transpose1d(inputs, self.weight, stride=self.stride)
+
+        # this is from torch-stft: https://github.com/pseeth/torch-stft
+        t = self.window.repeat(1, 1, inputs.size(-1))**2
+        coff = F.conv_transpose1d(t, self.enframe, stride=self.stride)
+        outputs = outputs / (coff + 1e-8)
+        return outputs
diff --git a/modelscope/models/audio/ans/frcrn.py b/modelscope/models/audio/ans/frcrn.py
new file mode 100644
index 00000000..c56b8773
--- /dev/null
+++ b/modelscope/models/audio/ans/frcrn.py
@@ -0,0 +1,309 @@
+import os
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.builder import MODELS
+from
modelscope.utils.constant import ModelFile, Tasks +from ...base import Model, Tensor +from .conv_stft import ConviSTFT, ConvSTFT +from .unet import UNet + + +class FTB(nn.Module): + + def __init__(self, input_dim=257, in_channel=9, r_channel=5): + + super(FTB, self).__init__() + self.in_channel = in_channel + self.conv1 = nn.Sequential( + nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]), + nn.BatchNorm2d(r_channel), nn.ReLU()) + + self.conv1d = nn.Sequential( + nn.Conv1d( + r_channel * input_dim, in_channel, kernel_size=9, padding=4), + nn.BatchNorm1d(in_channel), nn.ReLU()) + self.freq_fc = nn.Linear(input_dim, input_dim, bias=False) + + self.conv2 = nn.Sequential( + nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]), + nn.BatchNorm2d(in_channel), nn.ReLU()) + + def forward(self, inputs): + ''' + inputs should be [Batch, Ca, Dim, Time] + ''' + # T-F attention + conv1_out = self.conv1(inputs) + B, C, D, T = conv1_out.size() + reshape1_out = torch.reshape(conv1_out, [B, C * D, T]) + conv1d_out = self.conv1d(reshape1_out) + conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T]) + + # now is also [B,C,D,T] + att_out = conv1d_out * inputs + + # tranpose to [B,C,T,D] + att_out = torch.transpose(att_out, 2, 3) + freqfc_out = self.freq_fc(att_out) + att_out = torch.transpose(freqfc_out, 2, 3) + + cat_out = torch.cat([att_out, inputs], 1) + outputs = self.conv2(cat_out) + return outputs + + +@MODELS.register_module( + Tasks.speech_signal_process, module_name=Models.speech_frcrn_ans_cirm_16k) +class FRCRNModel(Model): + r""" A decorator of FRCRN for integrating into modelscope framework """ + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the frcrn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + self._model = FRCRN(*args, **kwargs) + model_bin_file = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + if os.path.exists(model_bin_file): + checkpoint = torch.load(model_bin_file) + self._model.load_state_dict(checkpoint, strict=False) + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + output = self._model.forward(input) + return { + 'spec_l1': output[0], + 'wav_l1': output[1], + 'mask_l1': output[2], + 'spec_l2': output[3], + 'wav_l2': output[4], + 'mask_l2': output[5] + } + + def to(self, *args, **kwargs): + self._model = self._model.to(*args, **kwargs) + return self + + def eval(self): + self._model = self._model.train(False) + return self + + +class FRCRN(nn.Module): + r""" Frequency Recurrent CRN """ + + def __init__(self, + complex, + model_complexity, + model_depth, + log_amp, + padding_mode, + win_len=400, + win_inc=100, + fft_len=512, + win_type='hanning'): + r""" + Args: + complex: Whether to use complex networks. + model_complexity: define the model complexity with the number of layers + model_depth: Only two options are available : 10, 20 + log_amp: Whether to use log amplitude to estimate signals + padding_mode: Encoder's convolution filter. 'zeros', 'reflect' + win_len: length of window used for defining one frame of sample points + win_inc: length of window shifting (equivalent to hop_size) + fft_len: number of Short Time Fourier Transform (STFT) points + win_type: windowing type used in STFT, eg. 
'hanning', 'hamming' + """ + super().__init__() + self.feat_dim = fft_len // 2 + 1 + + self.win_len = win_len + self.win_inc = win_inc + self.fft_len = fft_len + self.win_type = win_type + + fix = True + self.stft = ConvSTFT( + self.win_len, + self.win_inc, + self.fft_len, + self.win_type, + feature_type='complex', + fix=fix) + self.istft = ConviSTFT( + self.win_len, + self.win_inc, + self.fft_len, + self.win_type, + feature_type='complex', + fix=fix) + self.unet = UNet( + 1, + complex=complex, + model_complexity=model_complexity, + model_depth=model_depth, + padding_mode=padding_mode) + self.unet2 = UNet( + 1, + complex=complex, + model_complexity=model_complexity, + model_depth=model_depth, + padding_mode=padding_mode) + + def forward(self, inputs): + out_list = [] + # [B, D*2, T] + cmp_spec = self.stft(inputs) + # [B, 1, D*2, T] + cmp_spec = torch.unsqueeze(cmp_spec, 1) + + # to [B, 2, D, T] real_part/imag_part + cmp_spec = torch.cat([ + cmp_spec[:, :, :self.feat_dim, :], + cmp_spec[:, :, self.feat_dim:, :], + ], 1) + + # [B, 2, D, T] + cmp_spec = torch.unsqueeze(cmp_spec, 4) + # [B, 1, D, T, 2] + cmp_spec = torch.transpose(cmp_spec, 1, 4) + unet1_out = self.unet(cmp_spec) + cmp_mask1 = torch.tanh(unet1_out) + unet2_out = self.unet2(unet1_out) + cmp_mask2 = torch.tanh(unet2_out) + est_spec, est_wav, est_mask = self.apply_mask(cmp_spec, cmp_mask1) + out_list.append(est_spec) + out_list.append(est_wav) + out_list.append(est_mask) + cmp_mask2 = cmp_mask2 + cmp_mask1 + est_spec, est_wav, est_mask = self.apply_mask(cmp_spec, cmp_mask2) + out_list.append(est_spec) + out_list.append(est_wav) + out_list.append(est_mask) + return out_list + + def apply_mask(self, cmp_spec, cmp_mask): + est_spec = torch.cat([ + cmp_spec[:, :, :, :, 0] * cmp_mask[:, :, :, :, 0] + - cmp_spec[:, :, :, :, 1] * cmp_mask[:, :, :, :, 1], + cmp_spec[:, :, :, :, 0] * cmp_mask[:, :, :, :, 1] + + cmp_spec[:, :, :, :, 1] * cmp_mask[:, :, :, :, 0] + ], 1) + est_spec = torch.cat([est_spec[:, 0, :, :], est_spec[:, 1, :, :]], 1) + cmp_mask = torch.squeeze(cmp_mask, 1) + cmp_mask = torch.cat([cmp_mask[:, :, :, 0], cmp_mask[:, :, :, 1]], 1) + + est_wav = self.istft(est_spec) + est_wav = torch.squeeze(est_wav, 1) + return est_spec, est_wav, cmp_mask + + def get_params(self, weight_decay=0.0): + # add L2 penalty + weights, biases = [], [] + for name, param in self.named_parameters(): + if 'bias' in name: + biases += [param] + else: + weights += [param] + params = [{ + 'params': weights, + 'weight_decay': weight_decay, + }, { + 'params': biases, + 'weight_decay': 0.0, + }] + return params + + def loss(self, noisy, labels, out_list, mode='Mix'): + if mode == 'SiSNR': + count = 0 + while count < len(out_list): + est_spec = out_list[count] + count = count + 1 + est_wav = out_list[count] + count = count + 1 + est_mask = out_list[count] + count = count + 1 + if count != 3: + loss = self.loss_1layer(noisy, est_spec, est_wav, labels, + est_mask, mode) + return loss + + elif mode == 'Mix': + count = 0 + while count < len(out_list): + est_spec = out_list[count] + count = count + 1 + est_wav = out_list[count] + count = count + 1 + est_mask = out_list[count] + count = count + 1 + if count != 3: + amp_loss, phase_loss, SiSNR_loss = self.loss_1layer( + noisy, est_spec, est_wav, labels, est_mask, mode) + loss = amp_loss + phase_loss + SiSNR_loss + return loss, amp_loss, phase_loss + + def loss_1layer(self, noisy, est, est_wav, labels, cmp_mask, mode='Mix'): + r""" Compute the loss by mode + mode == 'Mix' + est: [B, F*2, T] + labels: [B, F*2,T] + 
mode == 'SiSNR' + est: [B, T] + labels: [B, T] + """ + if mode == 'SiSNR': + if labels.dim() == 3: + labels = torch.squeeze(labels, 1) + if est_wav.dim() == 3: + est_wav = torch.squeeze(est_wav, 1) + return -si_snr(est_wav, labels) + elif mode == 'Mix': + + if labels.dim() == 3: + labels = torch.squeeze(labels, 1) + if est_wav.dim() == 3: + est_wav = torch.squeeze(est_wav, 1) + SiSNR_loss = -si_snr(est_wav, labels) + + b, d, t = est.size() + S = self.stft(labels) + Sr = S[:, :self.feat_dim, :] + Si = S[:, self.feat_dim:, :] + Y = self.stft(noisy) + Yr = Y[:, :self.feat_dim, :] + Yi = Y[:, self.feat_dim:, :] + Y_pow = Yr**2 + Yi**2 + gth_mask = torch.cat([(Sr * Yr + Si * Yi) / (Y_pow + 1e-8), + (Si * Yr - Sr * Yi) / (Y_pow + 1e-8)], 1) + gth_mask[gth_mask > 2] = 1 + gth_mask[gth_mask < -2] = -1 + amp_loss = F.mse_loss(gth_mask[:, :self.feat_dim, :], + cmp_mask[:, :self.feat_dim, :]) * d + phase_loss = F.mse_loss(gth_mask[:, self.feat_dim:, :], + cmp_mask[:, self.feat_dim:, :]) * d + return amp_loss, phase_loss, SiSNR_loss + + +def l2_norm(s1, s2): + norm = torch.sum(s1 * s2, -1, keepdim=True) + return norm + + +def si_snr(s1, s2, eps=1e-8): + s1_s2_norm = l2_norm(s1, s2) + s2_s2_norm = l2_norm(s2, s2) + s_target = s1_s2_norm / (s2_s2_norm + eps) * s2 + e_nosie = s1 - s_target + target_norm = l2_norm(s_target, s_target) + noise_norm = l2_norm(e_nosie, e_nosie) + snr = 10 * torch.log10((target_norm) / (noise_norm + eps) + eps) + return torch.mean(snr) diff --git a/modelscope/models/audio/ans/se_module_complex.py b/modelscope/models/audio/ans/se_module_complex.py new file mode 100644 index 00000000..f62fe523 --- /dev/null +++ b/modelscope/models/audio/ans/se_module_complex.py @@ -0,0 +1,26 @@ +import torch +from torch import nn + + +class SELayer(nn.Module): + + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc_r = nn.Sequential( + nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), nn.Sigmoid()) + self.fc_i = nn.Sequential( + nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _, _ = x.size() + x_r = self.avg_pool(x[:, :, :, :, 0]).view(b, c) + x_i = self.avg_pool(x[:, :, :, :, 1]).view(b, c) + y_r = self.fc_r(x_r).view(b, c, 1, 1, 1) - self.fc_i(x_i).view( + b, c, 1, 1, 1) + y_i = self.fc_r(x_i).view(b, c, 1, 1, 1) + self.fc_i(x_r).view( + b, c, 1, 1, 1) + y = torch.cat([y_r, y_i], 4) + return x * y diff --git a/modelscope/models/audio/ans/unet.py b/modelscope/models/audio/ans/unet.py new file mode 100644 index 00000000..aa5a4254 --- /dev/null +++ b/modelscope/models/audio/ans/unet.py @@ -0,0 +1,269 @@ +import torch +import torch.nn as nn + +from . 
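
As a sanity check on the SI-SNR helpers defined at the end of frcrn.py above, a short sketch (illustrative only; the module path follows this patch):

    import torch
    from modelscope.models.audio.ans.frcrn import si_snr

    clean = torch.randn(4, 16000)
    noisy = clean + 0.1 * torch.randn(4, 16000)
    print(si_snr(clean, clean))  # near the eps-limited maximum for identical signals
    print(si_snr(noisy, clean))  # roughly 20 dB for 10% additive noise
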
import complex_nn +from .se_module_complex import SELayer + + +class Encoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding=None, + complex=False, + padding_mode='zeros'): + super().__init__() + if padding is None: + padding = [(i - 1) // 2 for i in kernel_size] # 'SAME' padding + + if complex: + conv = complex_nn.ComplexConv2d + bn = complex_nn.ComplexBatchNorm2d + else: + conv = nn.Conv2d + bn = nn.BatchNorm2d + + self.conv = conv( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + padding_mode=padding_mode) + self.bn = bn(out_channels) + self.relu = nn.LeakyReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Decoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding=(0, 0), + complex=False): + super().__init__() + if complex: + tconv = complex_nn.ComplexConvTranspose2d + bn = complex_nn.ComplexBatchNorm2d + else: + tconv = nn.ConvTranspose2d + bn = nn.BatchNorm2d + + self.transconv = tconv( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding) + self.bn = bn(out_channels) + self.relu = nn.LeakyReLU(inplace=True) + + def forward(self, x): + x = self.transconv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class UNet(nn.Module): + + def __init__(self, + input_channels=1, + complex=False, + model_complexity=45, + model_depth=20, + padding_mode='zeros'): + super().__init__() + + if complex: + model_complexity = int(model_complexity // 1.414) + + self.set_size( + model_complexity=model_complexity, + input_channels=input_channels, + model_depth=model_depth) + self.encoders = [] + self.model_length = model_depth // 2 + self.fsmn = complex_nn.ComplexUniDeepFsmn(128, 128, 128) + self.se_layers_enc = [] + self.fsmn_enc = [] + for i in range(self.model_length): + fsmn_enc = complex_nn.ComplexUniDeepFsmn_L1(128, 128, 128) + self.add_module('fsmn_enc{}'.format(i), fsmn_enc) + self.fsmn_enc.append(fsmn_enc) + module = Encoder( + self.enc_channels[i], + self.enc_channels[i + 1], + kernel_size=self.enc_kernel_sizes[i], + stride=self.enc_strides[i], + padding=self.enc_paddings[i], + complex=complex, + padding_mode=padding_mode) + self.add_module('encoder{}'.format(i), module) + self.encoders.append(module) + se_layer_enc = SELayer(self.enc_channels[i + 1], 8) + self.add_module('se_layer_enc{}'.format(i), se_layer_enc) + self.se_layers_enc.append(se_layer_enc) + self.decoders = [] + self.fsmn_dec = [] + self.se_layers_dec = [] + for i in range(self.model_length): + fsmn_dec = complex_nn.ComplexUniDeepFsmn_L1(128, 128, 128) + self.add_module('fsmn_dec{}'.format(i), fsmn_dec) + self.fsmn_dec.append(fsmn_dec) + module = Decoder( + self.dec_channels[i] * 2, + self.dec_channels[i + 1], + kernel_size=self.dec_kernel_sizes[i], + stride=self.dec_strides[i], + padding=self.dec_paddings[i], + complex=complex) + self.add_module('decoder{}'.format(i), module) + self.decoders.append(module) + if i < self.model_length - 1: + se_layer_dec = SELayer(self.dec_channels[i + 1], 8) + self.add_module('se_layer_dec{}'.format(i), se_layer_dec) + self.se_layers_dec.append(se_layer_dec) + if complex: + conv = complex_nn.ComplexConv2d + else: + conv = nn.Conv2d + + linear = conv(self.dec_channels[-1], 1, 1) + + self.add_module('linear', linear) + self.complex = complex + self.padding_mode = padding_mode + + self.decoders = nn.ModuleList(self.decoders) + self.encoders 
= nn.ModuleList(self.encoders) + self.se_layers_enc = nn.ModuleList(self.se_layers_enc) + self.se_layers_dec = nn.ModuleList(self.se_layers_dec) + self.fsmn_enc = nn.ModuleList(self.fsmn_enc) + self.fsmn_dec = nn.ModuleList(self.fsmn_dec) + + def forward(self, inputs): + x = inputs + # go down + xs = [] + xs_se = [] + xs_se.append(x) + for i, encoder in enumerate(self.encoders): + xs.append(x) + if i > 0: + x = self.fsmn_enc[i](x) + x = encoder(x) + xs_se.append(self.se_layers_enc[i](x)) + # xs : x0=input x1 ... x9 + x = self.fsmn(x) + + p = x + for i, decoder in enumerate(self.decoders): + p = decoder(p) + if i < self.model_length - 1: + p = self.fsmn_dec[i](p) + if i == self.model_length - 1: + break + if i < self.model_length - 2: + p = self.se_layers_dec[i](p) + p = torch.cat([p, xs_se[self.model_length - 1 - i]], dim=1) + + # cmp_spec: [12, 1, 513, 64, 2] + cmp_spec = self.linear(p) + return cmp_spec + + def set_size(self, model_complexity, model_depth=20, input_channels=1): + + if model_depth == 14: + self.enc_channels = [ + input_channels, 128, 128, 128, 128, 128, 128, 128 + ] + self.enc_kernel_sizes = [(5, 2), (5, 2), (5, 2), (5, 2), (5, 2), + (5, 2), (2, 2)] + self.enc_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (2, 1)] + self.enc_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 1), (0, 1)] + self.dec_channels = [64, 128, 128, 128, 128, 128, 128, 1] + self.dec_kernel_sizes = [(2, 2), (5, 2), (5, 2), (5, 2), (6, 2), + (5, 2), (5, 2)] + self.dec_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), + (2, 1)] + self.dec_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1), + (0, 1), (0, 1)] + + elif model_depth == 10: + self.enc_channels = [ + input_channels, + 16, + 32, + 64, + 128, + 256, + ] + self.enc_kernel_sizes = [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3)] + self.enc_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)] + self.enc_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1)] + self.dec_channels = [128, 128, 64, 32, 16, 1] + self.dec_kernel_sizes = [(3, 3), (3, 3), (3, 3), (4, 3), (3, 3)] + self.dec_strides = [(2, 1), (2, 1), (2, 1), (2, 1), (2, 1)] + self.dec_paddings = [(0, 1), (0, 1), (0, 1), (0, 1), (0, 1)] + + elif model_depth == 20: + self.enc_channels = [ + input_channels, model_complexity, model_complexity, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, 128 + ] + + self.enc_kernel_sizes = [(7, 1), (1, 7), (6, 4), (7, 5), (5, 3), + (5, 3), (5, 3), (5, 3), (5, 3), (5, 3)] + + self.enc_strides = [(1, 1), (1, 1), (2, 2), (2, 1), (2, 2), (2, 1), + (2, 2), (2, 1), (2, 2), (2, 1)] + + self.enc_paddings = [ + (3, 0), + (0, 3), + None, # (0, 2), + None, + None, # (3,1), + None, # (3,1), + None, # (1,2), + None, + None, + None + ] + + self.dec_channels = [ + 0, model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2, model_complexity * 2, + model_complexity * 2 + ] + + self.dec_kernel_sizes = [(4, 3), (4, 2), (4, 3), (4, 2), (4, 3), + (4, 2), (6, 3), (7, 4), (1, 7), (7, 1)] + + self.dec_strides = [(2, 1), (2, 2), (2, 1), (2, 2), (2, 1), (2, 2), + (2, 1), (2, 2), (1, 1), (1, 1)] + + self.dec_paddings = [(1, 1), (1, 0), (1, 1), (1, 0), (1, 1), + (1, 0), (2, 1), (2, 1), (0, 3), (3, 0)] + else: + raise ValueError('Unknown model depth : {}'.format(model_depth)) diff --git 
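
A small shape sketch for the real-valued branch of the Encoder above (the complex branch needs the complex_nn helpers, which this patch imports but does not show):

    import torch
    from modelscope.models.audio.ans.unet import Encoder

    enc = Encoder(1, 16, kernel_size=(3, 3), stride=(2, 1), complex=False)
    x = torch.randn(2, 1, 257, 50)   # [B, C, freq, time]
    print(enc(x).shape)              # torch.Size([2, 16, 129, 50]); stride (2, 1) halves freq
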
diff --git a/modelscope/pipelines/__init__.py b/modelscope/pipelines/__init__.py
index 14865872..74f5507f 100644
--- a/modelscope/pipelines/__init__.py
+++ b/modelscope/pipelines/__init__.py
@@ -1,4 +1,5 @@
 from .audio import LinearAECPipeline
+from .audio.ans_pipeline import ANSPipeline
 from .base import Pipeline
 from .builder import pipeline
 from .cv import *  # noqa F403
diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py
new file mode 100644
index 00000000..d9a04a29
--- /dev/null
+++ b/modelscope/pipelines/audio/ans_pipeline.py
@@ -0,0 +1,117 @@
+import os.path
+from typing import Any, Dict
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+
+from modelscope.metainfo import Pipelines
+from modelscope.utils.constant import Tasks
+from ..base import Input, Pipeline
+from ..builder import PIPELINES
+
+
+def audio_norm(x):
+    rms = (x**2).mean()**0.5
+    scalar = 10**(-25 / 20) / rms
+    x = x * scalar
+    pow_x = x**2
+    avg_pow_x = pow_x.mean()
+    rmsx = pow_x[pow_x > avg_pow_x].mean()**0.5
+    scalarx = 10**(-25 / 20) / rmsx
+    x = x * scalarx
+    return x
+
+
+@PIPELINES.register_module(
+    Tasks.speech_signal_process,
+    module_name=Pipelines.speech_frcrn_ans_cirm_16k)
+class ANSPipeline(Pipeline):
+    r"""ANS (Acoustic Noise Suppression) inference pipeline.
+
+    When invoked via pipeline.__call__(), it accepts a single parameter:
+        inputs (str): the path of a wav file
+    """
+    SAMPLE_RATE = 16000
+
+    def __init__(self, model):
+        r"""
+        Args:
+            model: model id on modelscope hub.
+        """
+        super().__init__(model=model)
+        self.device = torch.device(
+            'cuda' if torch.cuda.is_available() else 'cpu')
+        self.model = self.model.to(self.device)
+        self.model.eval()
+
+    def preprocess(self, inputs: Input) -> Dict[str, Any]:
+        assert isinstance(inputs, str) and os.path.exists(inputs) and os.path.isfile(inputs), \
+            f'Input file does not exist: {inputs}'
+        data1, fs = sf.read(inputs)
+        data1 = audio_norm(data1)
+        if fs != self.SAMPLE_RATE:
+            data1 = librosa.resample(data1, fs, self.SAMPLE_RATE)
+        if len(data1.shape) > 1:
+            data1 = data1[:, 0]
+        data = data1.astype(np.float32)
+        inputs = np.reshape(data, [1, data.shape[0]])
+        return {'ndarray': inputs, 'nsamples': data.shape[0]}
+
+    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        ndarray = inputs['ndarray']
+        nsamples = inputs['nsamples']
+        decode_do_segment = False
+        window = 16000
+        stride = int(window * 0.75)
+        print('inputs:{}'.format(ndarray.shape))
+        b, t = ndarray.shape
+        if t > window * 120:
+            decode_do_segment = True
+
+        if t < window:
+            ndarray = np.concatenate(
+                [ndarray, np.zeros((ndarray.shape[0], window - t))], 1)
+        elif t < window + stride:
+            padding = window + stride - t
+            print('padding: {}'.format(padding))
+            ndarray = np.concatenate(
+                [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
+        else:
+            if (t - window) % stride != 0:
+                padding = t - (t - window) // stride * stride
+                print('padding: {}'.format(padding))
+                ndarray = np.concatenate(
+                    [ndarray, np.zeros((ndarray.shape[0], padding))], 1)
+        print('inputs after padding:{}'.format(ndarray.shape))
+        with torch.no_grad():
+            ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device)
+            b, t = ndarray.shape
+            if decode_do_segment:
+                outputs = np.zeros(t)
+                give_up_length = (window - stride) // 2
+                current_idx = 0
+                while current_idx + window <= t:
+                    print('current_idx: {}'.format(current_idx))
+                    tmp_input = ndarray[:, current_idx:current_idx + window]
+                    tmp_output = self.model(
+                        tmp_input,
)['wav_l2'][0].cpu().numpy() + end_index = current_idx + window - give_up_length + if current_idx == 0: + outputs[current_idx: + end_index] = tmp_output[:-give_up_length] + else: + outputs[current_idx + + give_up_length:end_index] = tmp_output[ + give_up_length:-give_up_length] + current_idx += stride + else: + outputs = self.model(ndarray)['wav_l2'][0].cpu().numpy() + return {'output_pcm': outputs[:nsamples]} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + if 'output_path' in kwargs.keys(): + sf.write(kwargs['output_path'], inputs['output_pcm'], + self.SAMPLE_RATE) + return inputs diff --git a/requirements/audio.txt b/requirements/audio.txt index c7b2b239..1f5984ca 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -16,6 +16,7 @@ protobuf>3,<=3.20 ptflops PyWavelets>=1.0.0 scikit-learn +SoundFile>0.10 sox tensorboard tensorflow==1.15.* diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index bc3a542e..f317bc07 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -17,6 +17,9 @@ AEC_LIB_URL = 'http://isv-data.oss-cn-hangzhou.aliyuncs.com/ics%2FMaaS%2FAEC%2Fl '?Expires=1664085465&OSSAccessKeyId=LTAIxjQyZNde90zh&Signature=Y7gelmGEsQAJRK4yyHSYMrdWizk%3D' AEC_LIB_FILE = 'libmitaec_pyio.so' +NOISE_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ANS/sample_audio/speech_with_noise.wav' +NOISE_SPEECH_FILE = 'speech_with_noise.wav' + def download(remote_path, local_path): local_dir = os.path.dirname(local_path) @@ -30,23 +33,40 @@ def download(remote_path, local_path): class SpeechSignalProcessTest(unittest.TestCase): def setUp(self) -> None: - self.model_id = 'damo/speech_dfsmn_aec_psm_16k' + pass + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_aec(self): # A temporary hack to provide c++ lib. Download it first. 
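
A usage sketch of the ANSPipeline added above (the model id and wav file name mirror the test_ans case in this patch; output_path is the optional kwarg consumed in postprocess):

    from modelscope.metainfo import Pipelines
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    ans = pipeline(
        Tasks.speech_signal_process,
        model='damo/speech_frcrn_ans_cirm_16k',
        pipeline_name=Pipelines.speech_frcrn_ans_cirm_16k)
    # input: path to a 16 kHz wav file; denoised audio is written to output_path
    ans('speech_with_noise.wav', output_path='denoised.wav')
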
download(AEC_LIB_URL, AEC_LIB_FILE) - - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run(self): + # Download audio files download(NEAREND_MIC_URL, NEAREND_MIC_FILE) download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) + model_id = 'damo/speech_dfsmn_aec_psm_16k' input = { 'nearend_mic': NEAREND_MIC_FILE, 'farend_speech': FAREND_SPEECH_FILE } aec = pipeline( Tasks.speech_signal_process, - model=self.model_id, + model=model_id, pipeline_name=Pipelines.speech_dfsmn_aec_psm_16k) - aec(input, output_path='output.wav') + output_path = os.path.abspath('output.wav') + aec(input, output_path=output_path) + print(f'Processed audio saved to {output_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ans(self): + # Download audio files + download(NOISE_SPEECH_URL, NOISE_SPEECH_FILE) + model_id = 'damo/speech_frcrn_ans_cirm_16k' + ans = pipeline( + Tasks.speech_signal_process, + model=model_id, + pipeline_name=Pipelines.speech_frcrn_ans_cirm_16k) + output_path = os.path.abspath('output.wav') + ans(NOISE_SPEECH_FILE, output_path=output_path) + print(f'Processed audio saved to {output_path}') if __name__ == '__main__': From 5da470fd5d8a8a91936a41b21ad6ab1ebb9f3ba0 Mon Sep 17 00:00:00 2001 From: "feiwu.yfw" Date: Tue, 28 Jun 2022 20:40:57 +0800 Subject: [PATCH 5/9] [to #42791465, #42779255, #42777959, #42757844, #42756050, #42746916, #42743595, #42791863] fix: fix msdataset Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9174075 * fix msdataset --- modelscope/hub/errors.py | 15 ++++++ modelscope/msdatasets/config.py | 2 +- modelscope/msdatasets/ms_dataset.py | 56 +++++++++++++++------ modelscope/msdatasets/utils/ms_api.py | 48 ++++++++++++------ modelscope/utils/constant.py | 10 +++- tests/msdatasets/test_ms_dataset.py | 24 +++++---- tests/pipelines/test_image_matting.py | 3 +- tests/pipelines/test_text_classification.py | 8 ++- 8 files changed, 121 insertions(+), 45 deletions(-) diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 4b39d6e3..d39036a0 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -32,3 +32,18 @@ def raise_on_error(rsp): return True else: raise RequestError(rsp['Message']) + + +# TODO use raise_on_error instead if modelhub and datahub response have uniform structures, +def datahub_raise_on_error(url, rsp): + """If response error, raise exception + + Args: + rsp (_type_): The server response + """ + if rsp.get('Code') == 200: + return True + else: + raise RequestError( + f"Url = {url}, Status = {rsp.get('status')}, error = {rsp.get('error')}, message = {rsp.get('message')}" + ) diff --git a/modelscope/msdatasets/config.py b/modelscope/msdatasets/config.py index e916b3ec..00c24c3a 100644 --- a/modelscope/msdatasets/config.py +++ b/modelscope/msdatasets/config.py @@ -19,4 +19,4 @@ DOWNLOADED_DATASETS_PATH = Path( os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH)) MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT', - 'http://101.201.119.157:31752') + 'http://123.57.189.90:31752') diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 0466894c..90964b36 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -3,7 +3,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) import numpy as np -from datasets import Dataset +from datasets import Dataset, DatasetDict from datasets import load_dataset as hf_load_dataset from 
datasets.config import TF_AVAILABLE, TORCH_AVAILABLE from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES @@ -12,7 +12,7 @@ from datasets.utils.file_utils import (is_relative_path, from modelscope.msdatasets.config import MS_DATASETS_CACHE from modelscope.msdatasets.utils.ms_api import MsApi -from modelscope.utils.constant import Hubs +from modelscope.utils.constant import DownloadMode, Hubs from modelscope.utils.logger import get_logger logger = get_logger() @@ -34,6 +34,10 @@ class MsDataset: def __init__(self, hf_ds: Dataset, target: Optional[str] = None): self._hf_ds = hf_ds + if target is not None and target not in self._hf_ds.features: + raise TypeError( + f'"target" must be a column of the dataset({list(self._hf_ds.features.keys())}, but got {target}' + ) self.target = target def __iter__(self): @@ -48,17 +52,23 @@ class MsDataset: @classmethod def from_hf_dataset(cls, - hf_ds: Dataset, + hf_ds: Union[Dataset, DatasetDict], target: str = None) -> Union[dict, 'MsDataset']: if isinstance(hf_ds, Dataset): return cls(hf_ds, target) - if len(hf_ds.keys()) == 1: - return cls(next(iter(hf_ds.values())), target) - return {k: cls(v, target) for k, v in hf_ds.items()} + elif isinstance(hf_ds, DatasetDict): + if len(hf_ds.keys()) == 1: + return cls(next(iter(hf_ds.values())), target) + return {k: cls(v, target) for k, v in hf_ds.items()} + else: + raise TypeError( + f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' + ) @staticmethod def load( dataset_name: Union[str, list], + namespace: Optional[str] = None, target: Optional[str] = None, version: Optional[str] = None, hub: Optional[Hubs] = Hubs.modelscope, @@ -67,23 +77,32 @@ class MsDataset: data_dir: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, - Sequence[str]]]]] = None + Sequence[str]]]]] = None, + download_mode: Optional[DownloadMode] = DownloadMode. + REUSE_DATASET_IF_EXISTS ) -> Union[dict, 'MsDataset']: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: dataset_name (str): Path or name of the dataset. + namespace(str, optional): Namespace of the dataset. It should not be None, if you load a remote dataset + from Hubs.modelscope, target (str, optional): Name of the column to output. version (str, optional): Version of the dataset script to load: subset_name (str, optional): Defining the subset_name of the dataset. data_dir (str, optional): Defining the data_dir of the dataset configuration. I data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s). split (str, optional): Which split of the data to load. - hub (Hubs, optional): When loading from a remote hub, where it is from + hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope + download_mode (DownloadMode or str, optional): How to treat existing datasets. default + DownloadMode.REUSE_DATASET_IF_EXISTS Returns: MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. 
""" + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + hub = Hubs(hub or Hubs.modelscope) if hub == Hubs.huggingface: dataset = hf_load_dataset( dataset_name, @@ -91,21 +110,25 @@ class MsDataset: revision=version, split=split, data_dir=data_dir, - data_files=data_files) + data_files=data_files, + download_mode=download_mode.value) return MsDataset.from_hf_dataset(dataset, target=target) - else: + elif hub == Hubs.modelscope: return MsDataset._load_ms_dataset( dataset_name, + namespace=namespace, target=target, subset_name=subset_name, version=version, split=split, data_dir=data_dir, - data_files=data_files) + data_files=data_files, + download_mode=download_mode) @staticmethod def _load_ms_dataset( dataset_name: Union[str, list], + namespace: Optional[str] = None, target: Optional[str] = None, version: Optional[str] = None, subset_name: Optional[str] = None, @@ -113,17 +136,19 @@ class MsDataset: data_dir: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, - Sequence[str]]]]] = None + Sequence[str]]]]] = None, + download_mode: Optional[DownloadMode] = None ) -> Union[dict, 'MsDataset']: if isinstance(dataset_name, str): use_hf = False if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): use_hf = True - elif is_relative_path(dataset_name): + elif is_relative_path(dataset_name) and dataset_name.count( + '/') == 0: ms_api = MsApi() dataset_scripts = ms_api.fetch_dataset_scripts( - dataset_name, version) + dataset_name, namespace, download_mode, version) if 'py' in dataset_scripts: # dataset copied from hf datasets dataset_name = dataset_scripts['py'][0] use_hf = True @@ -140,7 +165,8 @@ class MsDataset: split=split, data_dir=data_dir, data_files=data_files, - cache_dir=MS_DATASETS_CACHE) + cache_dir=MS_DATASETS_CACHE, + download_mode=download_mode.value) else: # TODO load from ms datahub raise NotImplementedError( diff --git a/modelscope/msdatasets/utils/ms_api.py b/modelscope/msdatasets/utils/ms_api.py index fc3bcca2..c9b49ca1 100644 --- a/modelscope/msdatasets/utils/ms_api.py +++ b/modelscope/msdatasets/utils/ms_api.py @@ -1,11 +1,14 @@ import os +import shutil from collections import defaultdict from typing import Optional import requests +from modelscope.hub.errors import NotExistError, datahub_raise_on_error from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH, MS_HUB_ENDPOINT) +from modelscope.utils.constant import DownloadMode from modelscope.utils.logger import get_logger logger = get_logger() @@ -27,23 +30,38 @@ class MsApi: def fetch_dataset_scripts(self, dataset_name: str, - version: Optional[str] = 'master', - force_download=False): - datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}' - r = requests.get(datahub_url) - r.raise_for_status() - dataset_list = r.json()['Data'] - if len(dataset_list) == 0: - return None - dataset_id = dataset_list[0]['Id'] + namespace: str, + download_mode: Optional[DownloadMode], + version: Optional[str] = 'master'): + if namespace is None: + raise ValueError( + f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}' + ) version = version or 'master' - datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}' - r = requests.get(datahub_url) - r.raise_for_status() - file_list = r.json()['Data']['Files'] cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name, - version) + 
namespace, version) + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists( + cache_dir): + shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' + r = requests.get(datahub_url) + resp = r.json() + datahub_raise_on_error(datahub_url, resp) + dataset_id = resp['Data']['Id'] + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}' + r = requests.get(datahub_url) + resp = r.json() + datahub_raise_on_error(datahub_url, resp) + file_list = resp['Data'] + if file_list is None: + raise NotExistError( + f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, ' + f'version = {version}] dose not exist') + + file_list = file_list['Files'] local_paths = defaultdict(list) for file_info in file_list: file_path = file_info['Path'] @@ -54,7 +72,7 @@ class MsApi: r.raise_for_status() content = r.json()['Data']['Content'] local_path = os.path.join(cache_dir, file_path) - if os.path.exists(local_path) and not force_download: + if os.path.exists(local_path): logger.warning( f"Reusing dataset {dataset_name}'s python file ({local_path})" ) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index f2215359..55f015e8 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import enum class Fields(object): @@ -69,13 +70,20 @@ class InputFields(object): audio = 'audio' -class Hubs(object): +class Hubs(enum.Enum): """ Source from which an entity (such as a Dataset or Model) is stored """ modelscope = 'modelscope' huggingface = 'huggingface' +class DownloadMode(enum.Enum): + """ How to treat existing datasets + """ + REUSE_DATASET_IF_EXISTS = 'reuse_dataset_if_exists' + FORCE_REDOWNLOAD = 'force_redownload' + + class ModelFile(object): CONFIGURATION = 'configuration.json' README = 'README.md' diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index de413d5f..50767fd8 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -32,11 +32,12 @@ class ImgPreprocessor(Preprocessor): class MsDatasetTest(unittest.TestCase): - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_basic(self): - ms_ds_full = MsDataset.load('squad') + ms_ds_full = MsDataset.load('squad', namespace='damotest') ms_ds_full_hf = hfdata.load_dataset('squad') - ms_ds_train = MsDataset.load('squad', split='train') + ms_ds_train = MsDataset.load( + 'squad', namespace='damotest', split='train') ms_ds_train_hf = hfdata.load_dataset('squad', split='train') ms_image_train = MsDataset.from_hf_dataset( hfdata.load_dataset('beans', split='train')) @@ -48,7 +49,7 @@ class MsDatasetTest(unittest.TestCase): print(next(iter(ms_ds_train))) print(next(iter(ms_image_train))) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @require_torch def test_to_torch_dataset_text(self): model_id = 'damo/bert-base-sst2' @@ -57,13 +58,14 @@ class MsDatasetTest(unittest.TestCase): nlp_model.model_dir, first_sequence='context', second_sequence=None) - ms_ds_train = MsDataset.load('squad', split='train') + ms_ds_train = MsDataset.load( + 'squad', 
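
Putting the new namespace and download_mode arguments together, a sketch of loading a ModelScope-hub dataset (dataset name and namespace follow the tests in this patch; the import path is assumed from the package layout):

    from modelscope.msdatasets import MsDataset
    from modelscope.utils.constant import DownloadMode

    ds = MsDataset.load(
        'squad',
        namespace='damotest',
        split='train',
        download_mode=DownloadMode.FORCE_REDOWNLOAD)
    print(next(iter(ds)))
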
namespace='damotest', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
         print(next(iter(dataloader)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_tf
     def test_to_tf_dataset_text(self):
         import tensorflow as tf
@@ -74,7 +76,8 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -85,8 +88,8 @@ class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = MsDataset.from_hf_dataset(
-            hfdata.load_dataset('beans', split='train'))
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
                 image_path='image_file_path', label='labels'))
@@ -99,7 +102,8 @@ class MsDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = MsDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index de60ff0b..48a715f1 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -62,7 +62,8 @@ class ImageMattingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = MsDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load(
+            'beans', namespace='damotest', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index f913490c..1bf9f7ca 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -87,12 +87,16 @@ class SequenceClassificationTest(unittest.TestCase):
         result = text_classification(dataset)
         self.printDataset(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
         dataset = MsDataset.load(
-            'squad', split='train', target='context', hub=Hubs.modelscope)
+            'squad',
+            namespace='damotest',
+            split='train',
+            target='context',
+            hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)
 

From 0d17eb5b395b0d1a74e1a10ad754843bd6dfc71b Mon Sep 17 00:00:00 2001
From: "mulin.lyh"
Date: Tue, 28 Jun 2022 21:12:15 +0800
Subject: [PATCH 6/9] [to #42849800 #42822853 #42822836 #42822791 #42822717
 #42820011] fix: test bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix unit test bugs.

Link:
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9186775 * [to #42849800 #42822853 #42822836 #42822791 #42822717 #42820011]fix: test bugs --- modelscope/hub/api.py | 84 ++++++++++++++++------- modelscope/hub/errors.py | 4 ++ modelscope/hub/file_download.py | 16 +++-- modelscope/hub/git.py | 8 +++ modelscope/hub/repository.py | 12 ++-- modelscope/hub/snapshot_download.py | 16 ++--- modelscope/hub/utils/caching.py | 8 ++- modelscope/utils/hub.py | 5 +- tests/hub/test_hub_operation.py | 42 ++++++++++-- tests/hub/test_hub_private_files.py | 85 ++++++++++++++++++++++++ tests/hub/test_hub_private_repository.py | 9 ++- tests/hub/test_hub_repository.py | 24 ++----- 12 files changed, 235 insertions(+), 78 deletions(-) create mode 100644 tests/hub/test_hub_private_files.py diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index d102219b..e79bfd41 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -9,7 +9,7 @@ import requests from modelscope.utils.logger import get_logger from .constants import MODELSCOPE_URL_SCHEME -from .errors import NotExistError, is_ok, raise_on_error +from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error from .utils.utils import (get_endpoint, get_gitlab_domain, model_id_to_group_owner_name) @@ -61,17 +61,21 @@ class HubApi: return d['Data']['AccessToken'], cookies - def create_model(self, model_id: str, chinese_name: str, visibility: int, - license: str) -> str: + def create_model( + self, + model_id: str, + visibility: str, + license: str, + chinese_name: Optional[str] = None, + ) -> str: """ Create model repo at ModelScopeHub Args: model_id:(`str`): The model id - chinese_name(`str`): chinese name of the model - visibility(`int`): visibility of the model(1-private, 3-internal, 5-public) - license(`str`): license of the model, candidates can be found at: TBA - + visibility(`int`): visibility of the model(1-private, 5-public), default public. + license(`str`): license of the model, default none. + chinese_name(`str`, *optional*): chinese name of the model Returns: name of the model created @@ -79,6 +83,8 @@ class HubApi: model_id = {owner}/{name} """ + if model_id is None: + raise InvalidParameter('model_id is required!') cookies = ModelScopeConfig.get_cookies() if cookies is None: raise ValueError('Token does not exist, please login first.') @@ -151,11 +157,33 @@ class HubApi: else: r.raise_for_status() + def _check_cookie(self, + use_cookies: Union[bool, + CookieJar] = False) -> CookieJar: + cookies = None + if isinstance(use_cookies, CookieJar): + cookies = use_cookies + elif use_cookies: + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise ValueError('Token does not exist, please login first.') + return cookies + def get_model_branches_and_tags( self, model_id: str, + use_cookies: Union[bool, CookieJar] = False ) -> Tuple[List[str], List[str]]: - cookies = ModelScopeConfig.get_cookies() + """Get model branch and tags. + + Args: + model_id (str): The model id + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, will + will load cookie from local. Defaults to False. 
+        Returns:
+            Tuple[List[str], List[str]]: the branch list and tag list of the model.
+        """
+        cookies = self._check_cookie(use_cookies)
         path = f'{self.endpoint}/api/v1/models/{model_id}/revisions'
 
         r = requests.get(path, cookies=cookies)
@@ -169,23 +197,33 @@ class HubApi:
         ] if info['RevisionMap']['Tags'] else []
         return branches, tags
 
-    def get_model_files(
-            self,
-            model_id: str,
-            revision: Optional[str] = 'master',
-            root: Optional[str] = None,
-            recursive: Optional[str] = False,
-            use_cookies: Union[bool, CookieJar] = False) -> List[dict]:
+    def get_model_files(self,
+                        model_id: str,
+                        revision: Optional[str] = 'master',
+                        root: Optional[str] = None,
+                        recursive: Optional[str] = False,
+                        use_cookies: Union[bool, CookieJar] = False,
+                        is_snapshot: Optional[bool] = True) -> List[dict]:
+        """List the model files.
 
-        cookies = None
-        if isinstance(use_cookies, CookieJar):
-            cookies = use_cookies
-        elif use_cookies:
-            cookies = ModelScopeConfig.get_cookies()
-            if cookies is None:
-                raise ValueError('Token does not exist, please login first.')
+        Args:
+            model_id (str): The model id
+            revision (Optional[str], optional): The branch or tag name. Defaults to 'master'.
+            root (Optional[str], optional): The root path. Defaults to None.
+            recursive (Optional[str], optional): Whether to list files recursively. Defaults to False.
+            use_cookies (Union[bool, CookieJar], optional): If a CookieJar is given, it is
+                used directly; if True, the cookie is loaded from local storage. Defaults to False.
+            is_snapshot (Optional[bool], optional): True when called from snapshot_download, otherwise False.
 
-        path = f'{self.endpoint}/api/v1/models/{model_id}/repo/files?Revision={revision}&Recursive={recursive}'
+        Raises:
+            ValueError: If use_cookies is True but no local cookie exists.
+
+        Returns:
+            List[dict]: Model file list.
+        """
+        path = '%s/api/v1/models/%s/repo/files?Revision=%s&Recursive=%s&Snapshot=%s' % (
+            self.endpoint, model_id, revision, recursive, is_snapshot)
+        cookies = self._check_cookie(use_cookies)
         if root is not None:
             path = path + f'&Root={root}'
diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index d39036a0..9a19fdb5 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -10,6 +10,10 @@ class GitError(Exception):
     pass
 
 
+class InvalidParameter(Exception):
+    pass
+
+
 def is_ok(rsp):
     """
     Check the request is ok
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index b92bf89c..60aae3b6 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -7,6 +7,7 @@ import tempfile
 import time
 from functools import partial
 from hashlib import sha256
+from http.cookiejar import CookieJar
 from pathlib import Path
 from typing import BinaryIO, Dict, Optional, Union
 from uuid import uuid4
@@ -107,7 +108,9 @@ def model_file_download(
 
     _api = HubApi()
     headers = {'user-agent': http_user_agent(user_agent=user_agent, )}
-    branches, tags = _api.get_model_branches_and_tags(model_id)
+    cookies = ModelScopeConfig.get_cookies()
+    branches, tags = _api.get_model_branches_and_tags(
+        model_id, use_cookies=False if cookies is None else cookies)
     file_to_download_info = None
     is_commit_id = False
     if revision in branches or revision in tags:  # The revision is version or tag,
@@ -117,18 +120,19 @@ def model_file_download(
             model_id=model_id,
             revision=revision,
             recursive=True,
-        )
+            use_cookies=False if cookies is None else cookies,
+            is_snapshot=False)
         for model_file in model_files:
             if model_file['Type'] == 'tree':
                 continue
 
             if model_file['Path'] == file_path:
-                model_file['Branch'] = revision
                 if
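
A usage sketch of the reworked hub helpers above (a public model, so no login cookie is needed; the model id is taken from the tests in this section):

    from modelscope.hub.api import HubApi

    api = HubApi()
    branches, tags = api.get_model_branches_and_tags('damo/bert-base-sst2')
    files = api.get_model_files(
        'damo/bert-base-sst2', revision='master', recursive=True)
    print(branches, tags, [f['Path'] for f in files if f['Type'] != 'tree'])
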
cache.exists(model_file): return cache.get_file_by_info(model_file) else: file_to_download_info = model_file + break if file_to_download_info is None: raise NotExistError('The file path: %s not exist in: %s' % @@ -141,8 +145,6 @@ def model_file_download( return cached_file_path # the file is in cache. is_commit_id = True # we need to download again - # TODO: skip using JWT for authorization, use cookie instead - cookies = ModelScopeConfig.get_cookies() url_to_download = get_file_download_url(model_id, file_path, revision) file_to_download_info = { 'Path': file_path, @@ -202,7 +204,7 @@ def http_get_file( url: str, local_dir: str, file_name: str, - cookies: Dict[str, str], + cookies: CookieJar, headers: Optional[Dict[str, str]] = None, ): """ @@ -217,7 +219,7 @@ def http_get_file( local directory where the downloaded file stores file_name(`str`): name of the file stored in `local_dir` - cookies(`Dict[str, str]`): + cookies(`CookieJar`): cookies used to authentication the user, which is used for downloading private repos headers(`Optional[Dict[str, str]] = None`): http headers to carry necessary info when requesting the remote file diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 37f61814..54161f1c 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -70,6 +70,14 @@ class GitCommandWrapper(metaclass=Singleton): except GitError: return False + def git_lfs_install(self, repo_dir): + cmd = ['git', '-C', repo_dir, 'lfs', 'install'] + try: + self._run_git_command(*cmd) + return True + except GitError: + return False + def clone(self, repo_base_dir: str, token: str, diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index d9322144..37dec571 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -1,7 +1,7 @@ import os from typing import List, Optional -from modelscope.hub.errors import GitError +from modelscope.hub.errors import GitError, InvalidParameter from modelscope.utils.logger import get_logger from .api import ModelScopeConfig from .constants import MODELSCOPE_URL_SCHEME @@ -49,6 +49,8 @@ class Repository: git_wrapper = GitCommandWrapper() if not git_wrapper.is_lfs_installed(): logger.error('git lfs is not installed, please install.') + else: + git_wrapper.git_lfs_install(self.model_dir) # init repo lfs self.git_wrapper = GitCommandWrapper(git_path) os.makedirs(self.model_dir, exist_ok=True) @@ -74,8 +76,6 @@ class Repository: def push(self, commit_message: str, - files: List[str] = list(), - all_files: bool = False, branch: Optional[str] = 'master', force: bool = False): """Push local to remote, this method will do. @@ -86,8 +86,12 @@ class Repository: commit_message (str): commit message revision (Optional[str], optional): which branch to push. Defaults to 'master'. """ + if commit_message is None: + msg = 'commit_message must be provided!' 
+ raise InvalidParameter(msg) url = self.git_wrapper.get_repo_remote_url(self.model_dir) - self.git_wrapper.add(self.model_dir, files, all_files) + self.git_wrapper.pull(self.model_dir) + self.git_wrapper.add(self.model_dir, all_files=True) self.git_wrapper.commit(self.model_dir, commit_message) self.git_wrapper.push( repo_dir=self.model_dir, diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 90d850f4..91463f76 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -20,8 +20,7 @@ def snapshot_download(model_id: str, revision: Optional[str] = 'master', cache_dir: Union[str, Path, None] = None, user_agent: Optional[Union[Dict, str]] = None, - local_files_only: Optional[bool] = False, - private: Optional[bool] = False) -> str: + local_files_only: Optional[bool] = False) -> str: """Download all files of a repo. Downloads a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from a repo, because you don't know which @@ -79,8 +78,10 @@ def snapshot_download(model_id: str, # make headers headers = {'user-agent': http_user_agent(user_agent=user_agent, )} _api = HubApi() + cookies = ModelScopeConfig.get_cookies() # get file list from model repo - branches, tags = _api.get_model_branches_and_tags(model_id) + branches, tags = _api.get_model_branches_and_tags( + model_id, use_cookies=False if cookies is None else cookies) if revision not in branches and revision not in tags: raise NotExistError('The specified branch or tag : %s not exist!' % revision) @@ -89,11 +90,8 @@ def snapshot_download(model_id: str, model_id=model_id, revision=revision, recursive=True, - use_cookies=private) - - cookies = None - if private: - cookies = ModelScopeConfig.get_cookies() + use_cookies=False if cookies is None else cookies, + is_snapshot=True) for model_file in model_files: if model_file['Type'] == 'tree': @@ -116,7 +114,7 @@ def snapshot_download(model_id: str, local_dir=tempfile.gettempdir(), file_name=model_file['Name'], headers=headers, - cookies=None if cookies is None else cookies.get_dict()) + cookies=cookies) # put file to cache cache.put_file( model_file, diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index ac258385..7675e49b 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -101,8 +101,9 @@ class FileSystemCache(object): Args: key (dict): The cache key. """ - self.cached_files.remove(key) - self.save_cached_files() + if key in self.cached_files: + self.cached_files.remove(key) + self.save_cached_files() def exists(self, key): for cache_file in self.cached_files: @@ -204,6 +205,7 @@ class ModelFileSystemCache(FileSystemCache): return orig_path else: self.remove_key(cached_file) + break return None @@ -230,6 +232,7 @@ class ModelFileSystemCache(FileSystemCache): cached_key['Revision'].startswith(key['Revision']) or key['Revision'].startswith(cached_key['Revision'])): is_exists = True + break file_path = os.path.join(self.cache_root_location, model_file_info['Path']) if is_exists: @@ -253,6 +256,7 @@ class ModelFileSystemCache(FileSystemCache): cached_file['Path']) if os.path.exists(file_path): os.remove(file_path) + break def put_file(self, model_file_info, model_file_location): """Put model on model_file_location to cache, the model first download to /tmp, and move to cache. 
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index c427b7a3..3b7e80ef 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -31,9 +31,10 @@ def create_model_if_not_exist( else: api.create_model( model_id=model_id, - chinese_name=chinese_name, visibility=visibility, - license=license) + license=license, + chinese_name=chinese_name, + ) print(f'model {model_id} successfully created.') return True diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py index 035b183e..d193ce32 100644 --- a/tests/hub/test_hub_operation.py +++ b/tests/hub/test_hub_operation.py @@ -3,6 +3,7 @@ import os import tempfile import unittest import uuid +from shutil import rmtree from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.hub.constants import Licenses, ModelVisibility @@ -23,7 +24,6 @@ download_model_file_name = 'test.bin' class HubOperationTest(unittest.TestCase): def setUp(self): - self.old_cwd = os.getcwd() self.api = HubApi() # note this is temporary before official account management is ready self.api.login(USER_NAME, PASSWORD) @@ -31,19 +31,18 @@ class HubOperationTest(unittest.TestCase): self.model_id = '%s/%s' % (model_org, self.model_name) self.api.create_model( model_id=self.model_id, - chinese_name=model_chinese_name, visibility=ModelVisibility.PUBLIC, - license=Licenses.APACHE_V2) + license=Licenses.APACHE_V2, + chinese_name=model_chinese_name, + ) temporary_dir = tempfile.mkdtemp() self.model_dir = os.path.join(temporary_dir, self.model_name) repo = Repository(self.model_dir, clone_from=self.model_id) - os.chdir(self.model_dir) os.system("echo 'testtest'>%s" - % os.path.join(self.model_dir, 'test.bin')) - repo.push('add model', all_files=True) + % os.path.join(self.model_dir, download_model_file_name)) + repo.push('add model') def tearDown(self): - os.chdir(self.old_cwd) self.api.delete_model(model_id=self.model_id) def test_model_repo_creation(self): @@ -79,6 +78,35 @@ class HubOperationTest(unittest.TestCase): mdtime2 = os.path.getmtime(downloaded_file_path) assert mdtime1 == mdtime2 + def test_download_public_without_login(self): + rmtree(ModelScopeConfig.path_credential) + snapshot_path = snapshot_download(model_id=self.model_id) + downloaded_file_path = os.path.join(snapshot_path, + download_model_file_name) + assert os.path.exists(downloaded_file_path) + temporary_dir = tempfile.mkdtemp() + downloaded_file = model_file_download( + model_id=self.model_id, + file_path=download_model_file_name, + cache_dir=temporary_dir) + assert os.path.exists(downloaded_file) + self.api.login(USER_NAME, PASSWORD) + + def test_snapshot_delete_download_cache_file(self): + snapshot_path = snapshot_download(model_id=self.model_id) + downloaded_file_path = os.path.join(snapshot_path, + download_model_file_name) + assert os.path.exists(downloaded_file_path) + os.remove(downloaded_file_path) + # download again in cache + file_download_path = model_file_download( + model_id=self.model_id, file_path='README.md') + assert os.path.exists(file_download_path) + # deleted file need download again + file_download_path = model_file_download( + model_id=self.model_id, file_path=download_model_file_name) + assert os.path.exists(file_download_path) + if __name__ == '__main__': unittest.main() diff --git a/tests/hub/test_hub_private_files.py b/tests/hub/test_hub_private_files.py new file mode 100644 index 00000000..b9c71456 --- /dev/null +++ b/tests/hub/test_hub_private_files.py @@ -0,0 +1,85 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import tempfile +import unittest +import uuid + +from requests.exceptions import HTTPError + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.errors import GitError +from modelscope.hub.file_download import model_file_download +from modelscope.hub.repository import Repository +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.utils.constant import ModelFile + +USER_NAME = 'maasadmin' +PASSWORD = '12345678' +USER_NAME2 = 'sdkdev' + +model_chinese_name = '达摩卡通化模型' +model_org = 'unittest' + + +class HubPrivateFileDownloadTest(unittest.TestCase): + + def setUp(self): + self.old_cwd = os.getcwd() + self.api = HubApi() + # note this is temporary before official account management is ready + self.token, _ = self.api.login(USER_NAME, PASSWORD) + self.model_name = uuid.uuid4().hex + self.model_id = '%s/%s' % (model_org, self.model_name) + self.api.create_model( + model_id=self.model_id, + visibility=ModelVisibility.PRIVATE, # 1-private, 5-public + license=Licenses.APACHE_V2, + chinese_name=model_chinese_name, + ) + + def tearDown(self): + os.chdir(self.old_cwd) + self.api.delete_model(model_id=self.model_id) + + def test_snapshot_download_private_model(self): + snapshot_path = snapshot_download(self.model_id) + assert os.path.exists(os.path.join(snapshot_path, ModelFile.README)) + + def test_snapshot_download_private_model_no_permission(self): + self.token, _ = self.api.login(USER_NAME2, PASSWORD) + with self.assertRaises(HTTPError): + snapshot_download(self.model_id) + self.api.login(USER_NAME, PASSWORD) + + def test_download_file_private_model(self): + file_path = model_file_download(self.model_id, ModelFile.README) + assert os.path.exists(file_path) + + def test_download_file_private_model_no_permission(self): + self.token, _ = self.api.login(USER_NAME2, PASSWORD) + with self.assertRaises(HTTPError): + model_file_download(self.model_id, ModelFile.README) + self.api.login(USER_NAME, PASSWORD) + + def test_snapshot_download_local_only(self): + with self.assertRaises(ValueError): + snapshot_download(self.model_id, local_files_only=True) + snapshot_path = snapshot_download(self.model_id) + assert os.path.exists(os.path.join(snapshot_path, ModelFile.README)) + snapshot_path = snapshot_download(self.model_id, local_files_only=True) + assert os.path.exists(snapshot_path) + + def test_file_download_local_only(self): + with self.assertRaises(ValueError): + model_file_download( + self.model_id, ModelFile.README, local_files_only=True) + file_path = model_file_download(self.model_id, ModelFile.README) + assert os.path.exists(file_path) + file_path = model_file_download( + self.model_id, ModelFile.README, local_files_only=True) + assert os.path.exists(file_path) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py index b6e3536c..01a89586 100644 --- a/tests/hub/test_hub_private_repository.py +++ b/tests/hub/test_hub_private_repository.py @@ -5,6 +5,7 @@ import unittest import uuid from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility from modelscope.hub.errors import GitError from modelscope.hub.repository import Repository @@ -16,9 +17,6 @@ model_chinese_name = '达摩卡通化模型' model_org = 'unittest' DEFAULT_GIT_PATH = 'git' -sample_model_url = 'https://mindscope.oss-cn-hangzhou.aliyuncs.com/test_models/mnist-12.onnx' -download_model_file_name = 
'mnist-12.onnx' - class HubPrivateRepositoryTest(unittest.TestCase): @@ -31,9 +29,10 @@ class HubPrivateRepositoryTest(unittest.TestCase): self.model_id = '%s/%s' % (model_org, self.model_name) self.api.create_model( model_id=self.model_id, + visibility=ModelVisibility.PRIVATE, # 1-private, 5-public + license=Licenses.APACHE_V2, chinese_name=model_chinese_name, - visibility=1, # 1-private, 5-public - license='apache-2.0') + ) def tearDown(self): self.api.login(USER_NAME, PASSWORD) diff --git a/tests/hub/test_hub_repository.py b/tests/hub/test_hub_repository.py index 7b1cc751..99f63eca 100644 --- a/tests/hub/test_hub_repository.py +++ b/tests/hub/test_hub_repository.py @@ -2,7 +2,6 @@ import os import shutil import tempfile -import time import unittest import uuid from os.path import expanduser @@ -10,6 +9,7 @@ from os.path import expanduser from requests import delete from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility from modelscope.hub.errors import NotExistError from modelscope.hub.file_download import model_file_download from modelscope.hub.repository import Repository @@ -55,9 +55,10 @@ class HubRepositoryTest(unittest.TestCase): self.model_id = '%s/%s' % (model_org, self.model_name) self.api.create_model( model_id=self.model_id, + visibility=ModelVisibility.PUBLIC, # 1-private, 5-public + license=Licenses.APACHE_V2, chinese_name=model_chinese_name, - visibility=5, # 1-private, 5-public - license='apache-2.0') + ) temporary_dir = tempfile.mkdtemp() self.model_dir = os.path.join(temporary_dir, self.model_name) @@ -81,27 +82,12 @@ class HubRepositoryTest(unittest.TestCase): os.chdir(self.model_dir) os.system("echo '111'>%s" % os.path.join(self.model_dir, 'add1.py')) os.system("echo '222'>%s" % os.path.join(self.model_dir, 'add2.py')) - repo.push('test', all_files=True) + repo.push('test') add1 = model_file_download(self.model_id, 'add1.py') assert os.path.exists(add1) add2 = model_file_download(self.model_id, 'add2.py') assert os.path.exists(add2) - def test_push_files(self): - repo = Repository(self.model_dir, clone_from=self.model_id) - assert os.path.exists(os.path.join(self.model_dir, 'README.md')) - os.system("echo '111'>%s" % os.path.join(self.model_dir, 'add1.py')) - os.system("echo '222'>%s" % os.path.join(self.model_dir, 'add2.py')) - os.system("echo '333'>%s" % os.path.join(self.model_dir, 'add3.py')) - repo.push('test', files=['add1.py', 'add2.py'], all_files=False) - add1 = model_file_download(self.model_id, 'add1.py') - assert os.path.exists(add1) - add2 = model_file_download(self.model_id, 'add2.py') - assert os.path.exists(add2) - with self.assertRaises(NotExistError) as cm: - model_file_download(self.model_id, 'add3.py') - print(cm.exception) - if __name__ == '__main__': unittest.main() From 1cb2fa850f2f9b468798b062bb4bd23065eeea88 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Tue, 28 Jun 2022 22:19:37 +0800 Subject: [PATCH 7/9] [to #42362425] update version with 0.2.1 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index df9144c5..fc79d63d 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.2.1' From 576b7cffb11532c3431fbfc2998ae833408c327b Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Wed, 29 Jun 2022 09:12:59 +0800 Subject: [PATCH 8/9] [to #42322933] add pipeline params for preprocess and forward & zeroshot classification Link: 
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9180863 --- modelscope/metainfo.py | 2 + modelscope/models/__init__.py | 3 +- modelscope/models/nlp/__init__.py | 1 + .../nlp/sbert_for_zero_shot_classification.py | 50 ++++++++++ modelscope/pipelines/base.py | 55 ++++++++--- modelscope/pipelines/builder.py | 3 + modelscope/pipelines/nlp/__init__.py | 1 + .../nlp/zero_shot_classification_pipeline.py | 97 +++++++++++++++++++ modelscope/pipelines/outputs.py | 7 ++ modelscope/preprocessors/nlp.py | 46 ++++++++- modelscope/utils/constant.py | 1 + .../test_zero_shot_classification.py | 64 ++++++++++++ 12 files changed, 313 insertions(+), 17 deletions(-) create mode 100644 modelscope/models/nlp/sbert_for_zero_shot_classification.py create mode 100644 modelscope/pipelines/nlp/zero_shot_classification_pipeline.py create mode 100644 tests/pipelines/test_zero_shot_classification.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index eda590ac..1d2ee4d2 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -52,6 +52,7 @@ class Pipelines(object): text_generation = 'text-generation' sentiment_analysis = 'sentiment-analysis' fill_mask = 'fill-mask' + zero_shot_classification = 'zero-shot-classification' # audio tasks sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts' @@ -95,6 +96,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' palm_text_gen_tokenizer = 'palm-text-gen-tokenizer' sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer' + zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index 816c44e2..f1074f68 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -7,4 +7,5 @@ from .audio.tts.vocoder import Hifigan16k from .base import Model from .builder import MODELS, build_model from .multi_modal import OfaForImageCaptioning -from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity +from .nlp import (BertForSequenceClassification, SbertForSentenceSimilarity, + SbertForZeroShotClassification) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 6be4493b..f904efdf 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -3,3 +3,4 @@ from .masked_language_model import * # noqa F403 from .palm_for_text_generation import * # noqa F403 from .sbert_for_sentence_similarity import * # noqa F403 from .sbert_for_token_classification import * # noqa F403 +from .sbert_for_zero_shot_classification import * # noqa F403 diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py new file mode 100644 index 00000000..837bb41e --- /dev/null +++ b/modelscope/models/nlp/sbert_for_zero_shot_classification.py @@ -0,0 +1,50 @@ +from typing import Any, Dict + +import numpy as np + +from modelscope.utils.constant import Tasks +from ...metainfo import Models +from ..base import Model +from ..builder import MODELS + +__all__ = ['SbertForZeroShotClassification'] + + +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.structbert) +class SbertForZeroShotClassification(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the zero shot classification model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ """ + + super().__init__(model_dir, *args, **kwargs) + from sofa import SbertForSequenceClassification + self.model = SbertForSequenceClassification.from_pretrained(model_dir) + + def train(self): + return self.model.train() + + def eval(self): + return self.model.eval() + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + """return the result by the model + + Args: + input (Dict[str, Any]): the preprocessed data + + Returns: + Dict[str, np.ndarray]: results + Example: + { + 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value + } + """ + outputs = self.model(**input) + logits = outputs['logits'].numpy() + res = {'logits': logits} + return res diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 2f5d5dcc..4052d35a 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -74,33 +74,57 @@ class Pipeline(ABC): self.preprocessor = preprocessor def __call__(self, input: Union[Input, List[Input]], *args, - **post_kwargs) -> Union[Dict[str, Any], Generator]: + **kwargs) -> Union[Dict[str, Any], Generator]: # model provider should leave it as it is # modelscope library developer will handle this function # simple showcase, need to support iterator type for both tensorflow and pytorch # input_dict = self._handle_input(input) + + # sanitize the parameters + preprocess_params, forward_params, postprocess_params = self._sanitize_parameters( + **kwargs) + kwargs['preprocess_params'] = preprocess_params + kwargs['forward_params'] = forward_params + kwargs['postprocess_params'] = postprocess_params + if isinstance(input, list): output = [] for ele in input: - output.append(self._process_single(ele, *args, **post_kwargs)) + output.append(self._process_single(ele, *args, **kwargs)) elif isinstance(input, MsDataset): - return self._process_iterator(input, *args, **post_kwargs) + return self._process_iterator(input, *args, **kwargs) else: - output = self._process_single(input, *args, **post_kwargs) + output = self._process_single(input, *args, **kwargs) return output - def _process_iterator(self, input: Input, *args, **post_kwargs): + def _sanitize_parameters(self, **pipeline_parameters): + """ + this method should sanitize the keyword args to preprocessor params, + forward params and postprocess params on '__call__' or '_process_single' method + considered to be a normal classmethod with default implementation / output + + Default Returns: + Dict[str, str]: preprocess_params = {} + Dict[str, str]: forward_params = {} + Dict[str, str]: postprocess_params = pipeline_parameters + """ + return {}, {}, pipeline_parameters + + def _process_iterator(self, input: Input, *args, **kwargs): for ele in input: - yield self._process_single(ele, *args, **post_kwargs) + yield self._process_single(ele, *args, **kwargs) + + def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]: + preprocess_params = kwargs.get('preprocess_params') + forward_params = kwargs.get('forward_params') + postprocess_params = kwargs.get('postprocess_params') - def _process_single(self, input: Input, *args, - **post_kwargs) -> Dict[str, Any]: - out = self.preprocess(input) - out = self.forward(out) - out = self.postprocess(out, **post_kwargs) + out = self.preprocess(input, **preprocess_params) + out = self.forward(out, **forward_params) + out = self.postprocess(out, **postprocess_params) self._check_output(out) return out @@ -120,20 +144,21 @@ class Pipeline(ABC): raise ValueError(f'expected output keys are {output_keys}, ' f'those 
{missing_keys} are missing') - def preprocess(self, inputs: Input) -> Dict[str, Any]: + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: """ Provide default implementation based on preprocess_cfg and user can reimplement it """ assert self.preprocessor is not None, 'preprocess method should be implemented' assert not isinstance(self.preprocessor, List),\ 'default implementation does not support using multiple preprocessors.' - return self.preprocessor(inputs) + return self.preprocessor(inputs, **preprocess_params) - def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: """ Provide default implementation using self.model and user can reimplement it """ assert self.model is not None, 'forward method should be implemented' assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.' - return self.model(inputs) + return self.model(inputs, **forward_params) @abstractmethod def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 41cd73da..847955d4 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -27,6 +27,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/bert-base-sst2'), Tasks.text_generation: (Pipelines.text_generation, 'damo/nlp_palm2.0_text-generation_chinese-base'), + Tasks.zero_shot_classification: + (Pipelines.zero_shot_classification, + 'damo/nlp_structbert_zero-shot-classification_chinese-base'), Tasks.image_captioning: (Pipelines.image_caption, 'damo/ofa_image-caption_coco_large_en'), Tasks.image_generation: diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index c50875fd..5ef12e22 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -3,3 +3,4 @@ from .sentence_similarity_pipeline import * # noqa F403 from .sequence_classification_pipeline import * # noqa F403 from .text_generation_pipeline import * # noqa F403 from .word_segmentation_pipeline import * # noqa F403 +from .zero_shot_classification_pipeline import * # noqa F403 diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py new file mode 100644 index 00000000..2ed4dac3 --- /dev/null +++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py @@ -0,0 +1,97 @@ +import os +import uuid +from typing import Any, Dict, Union + +import json +import numpy as np +import torch +from scipy.special import softmax + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp import SbertForZeroShotClassification +from ...preprocessors import ZeroShotClassificationPreprocessor +from ...utils.constant import Tasks +from ..base import Input, Pipeline +from ..builder import PIPELINES + +__all__ = ['ZeroShotClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.zero_shot_classification, + module_name=Pipelines.zero_shot_classification) +class ZeroShotClassificationPipeline(Pipeline): + + def __init__(self, + model: Union[SbertForZeroShotClassification, str], + preprocessor: ZeroShotClassificationPreprocessor = None, + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (SbertForSentimentClassification): a model instance + preprocessor (SentimentClassificationPreprocessor): a preprocessor 
instance + """ + assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \ + 'model must be a single str or SbertForZeroShotClassification' + model = model if isinstance( + model, + SbertForZeroShotClassification) else Model.from_pretrained(model) + + self.entailment_id = 0 + self.contradiction_id = 2 + + if preprocessor is None: + preprocessor = ZeroShotClassificationPreprocessor(model.model_dir) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + postprocess_params = {} + + if 'candidate_labels' in kwargs: + candidate_labels = kwargs.pop('candidate_labels') + preprocess_params['candidate_labels'] = candidate_labels + postprocess_params['candidate_labels'] = candidate_labels + else: + raise ValueError('You must include at least one label.') + preprocess_params['hypothesis_template'] = kwargs.pop( + 'hypothesis_template', '{}') + + postprocess_params['multi_label'] = kwargs.pop('multi_label', False) + return preprocess_params, {}, postprocess_params + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + candidate_labels, + multi_label=False) -> Dict[str, Any]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, Any]: the prediction results + """ + + logits = inputs['logits'] + if multi_label or len(candidate_labels) == 1: + logits = logits[..., [self.contradiction_id, self.entailment_id]] + scores = softmax(logits, axis=-1)[..., 1] + else: + logits = logits[..., self.entailment_id] + scores = softmax(logits, axis=-1) + + reversed_index = list(reversed(scores.argsort())) + result = { + 'labels': [candidate_labels[i] for i in reversed_index], + 'scores': [scores[i].item() for i in reversed_index], + } + return result diff --git a/modelscope/pipelines/outputs.py b/modelscope/pipelines/outputs.py index 52b7eeae..290e6717 100644 --- a/modelscope/pipelines/outputs.py +++ b/modelscope/pipelines/outputs.py @@ -101,6 +101,13 @@ TASK_OUTPUTS = { # } Tasks.sentence_similarity: ['scores', 'labels'], + # zero-shot classification result for single sample + # { + # "labels": ["happy", "sad", "calm", "angry"], + # "scores": [0.9, 0.1, 0.05, 0.05] + # } + Tasks.zero_shot_classification: ['scores', 'labels'], + # ============ audio tasks =================== # audio processed for single file in PCM format diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 4ed63f3c..e8e33e74 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -14,7 +14,7 @@ from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', 'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor', - 'FillMaskPreprocessor' + 'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor' ] @@ -286,3 +286,47 @@ class TokenClassifcationPreprocessor(Preprocessor): 'attention_mask': attention_mask, 'token_type_ids': token_type_ids } + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer) +class ZeroShotClassificationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + from 
sofa import SbertTokenizer + self.model_dir: str = model_dir + self.sequence_length = kwargs.pop('sequence_length', 512) + self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, str) + def __call__(self, data: str, hypothesis_template: str, + candidate_labels: list) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + pairs = [[data, hypothesis_template.format(label)] + for label in candidate_labels] + + features = self.tokenizer( + pairs, + padding=True, + truncation=True, + max_length=self.sequence_length, + return_tensors='pt', + truncation_strategy='only_first') + return features diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 55f015e8..44bd1dff 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -48,6 +48,7 @@ class Tasks(object): fill_mask = 'fill-mask' summarization = 'summarization' question_answering = 'question-answering' + zero_shot_classification = 'zero-shot-classification' # audio tasks auto_speech_recognition = 'auto-speech-recognition' diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py new file mode 100644 index 00000000..b76a6a86 --- /dev/null +++ b/tests/pipelines/test_zero_shot_classification.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForZeroShotClassification +from modelscope.pipelines import ZeroShotClassificationPipeline, pipeline +from modelscope.preprocessors import ZeroShotClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class ZeroShotClassificationTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base' + sentence = '全新突破 解放军运20版空中加油机曝光' + labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事'] + template = '这篇文章的标题是{}' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = ZeroShotClassificationPreprocessor(cache_path) + model = SbertForZeroShotClassification(cache_path, tokenizer=tokenizer) + pipeline1 = ZeroShotClassificationPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.zero_shot_classification, + model=model, + preprocessor=tokenizer) + + print( + f'sentence: {self.sentence}\n' + f'pipeline1:{pipeline1(input=self.sentence,candidate_labels=self.labels)}' + ) + print() + print( + f'sentence: {self.sentence}\n' + f'pipeline2: {pipeline2(self.sentence,candidate_labels=self.labels,hypothesis_template=self.template)}' + ) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = ZeroShotClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.zero_shot_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence, candidate_labels=self.labels)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.zero_shot_classification, 
model=self.model_id)
+        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        pipeline_ins = pipeline(task=Tasks.zero_shot_classification)
+        print(pipeline_ins(input=self.sentence, candidate_labels=self.labels))
+
+
+if __name__ == '__main__':
+    unittest.main()

From fabea5604e5795ce5cd341090865cf409490b062 Mon Sep 17 00:00:00 2001
From: "hemu.zp"
Date: Wed, 29 Jun 2022 11:08:34 +0800
Subject: [PATCH 9/9] [to #42322933] Add MPLUG model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the visual question answering task pipeline for the MPLUG model

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9182119
---
 data/test/images/image_mplug_vqa.jpg | 3 +
 modelscope/metainfo.py | 3 +
 modelscope/models/multi_modal/__init__.py | 2 +
 .../mplug_for_visual_question_answering.py | 46 +++++++++++
 modelscope/pipelines/builder.py | 5 +-
 modelscope/pipelines/multi_modal/__init__.py | 1 +
 .../visual_question_answering_pipeline.py | 65 +++++++++++++++++++
 modelscope/preprocessors/__init__.py | 2 +-
 modelscope/preprocessors/multi_modal.py | 45 +++++++++++
 modelscope/utils/constant.py | 1 +
 requirements/nlp.txt | 2 +-
 .../test_visual_question_answering.py | 60 +++++++++++++++++
 12 files changed, 232 insertions(+), 3 deletions(-)
 create mode 100644 data/test/images/image_mplug_vqa.jpg
 create mode 100644 modelscope/models/multi_modal/mplug_for_visual_question_answering.py
 create mode 100644 modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
 create mode 100644 tests/pipelines/test_visual_question_answering.py
diff --git a/data/test/images/image_mplug_vqa.jpg b/data/test/images/image_mplug_vqa.jpg
new file mode 100644
index 00000000..57919471
--- /dev/null
+++ b/data/test/images/image_mplug_vqa.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b37b706885849037b5fa7fa44a3b78a6375f768d95ce46bfcb8e7329d038a692
+size 181725
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 1d2ee4d2..485605bb 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -27,6 +27,7 @@ class Models(object):
     # multi-modal models
     ofa = 'ofa'
     clip = 'clip-multi-modal-embedding'
+    mplug = 'mplug'


 class Pipelines(object):
@@ -63,6 +64,7 @@ class Pipelines(object):
     # multi-modal tasks
     image_caption = 'image-caption'
     multi_modal_embedding = 'multi-modal-embedding'
+    visual_question_answering = 'visual-question-answering'


 class Trainers(object):
@@ -105,3 +107,4 @@ class Preprocessors(object):

     # multi-modal
     ofa_image_caption = 'ofa-image-caption'
+    mplug_visual_question_answering = 'mplug-visual-question-answering'
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 2e6cc3bf..4ed9809b 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -1,2 +1,4 @@
 from .clip.clip_model import CLIPForMultiModalEmbedding
 from .image_captioning_model import OfaForImageCaptioning
+from .mplug_for_visual_question_answering import \
+    MPlugForVisualQuestionAnswering
diff --git a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py b/modelscope/models/multi_modal/mplug_for_visual_question_answering.py
new file mode 100644
index 00000000..2682c048
--- /dev/null
+++ b/modelscope/models/multi_modal/mplug_for_visual_question_answering.py
@@ -0,0 +1,46 @@
+from typing import Dict
+
+from ...metainfo import Models
+from ...utils.constant import Tasks
+from ..base import Model, Tensor
+from ..builder import MODELS
+
+__all__ = ['MPlugForVisualQuestionAnswering']
+
+
+@MODELS.register_module(
+    Tasks.visual_question_answering, module_name=Models.mplug)
+class MPlugForVisualQuestionAnswering(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the mplug model from the `model_dir` path.
+        Args:
+            model_dir (str): the model path.
+        """
+
+        super().__init__(model_dir, *args, **kwargs)
+        from sofa.models.mplug import MPlugForVisualQuestionAnswering
+        self.model = MPlugForVisualQuestionAnswering.from_pretrained(model_dir)
+        self.tokenizer = self.model.tokenizer
+
+    def train(self):
+        return self.model.train()
+
+    def eval(self):
+        return self.model.eval()
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Return the model prediction for the preprocessed input.
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data
+
+        Returns:
+            Dict[str, Tensor]: results
+                Example:
+                    {
+                        'predictions': Tensor([[1377, 4959, 2785, 6392...]]),
+                    }
+        """
+
+        return self.model(**input)[0]
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 847955d4..2f66682d 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -42,7 +42,10 @@ DEFAULT_MODEL_FOR_PIPELINE = {
         'damo/cv_TAdaConv_action-recognition'),
     Tasks.multi_modal_embedding:
     (Pipelines.multi_modal_embedding,
-     'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding')
+     'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding'),
+    Tasks.visual_question_answering:
+    (Pipelines.visual_question_answering,
+     'damo/mplug_visual-question-answering_coco_large_en'),
 }
diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py
index 6c96d843..fdcada89 100644
--- a/modelscope/pipelines/multi_modal/__init__.py
+++ b/modelscope/pipelines/multi_modal/__init__.py
@@ -1,2 +1,3 @@
 from .image_captioning_pipeline import ImageCaptionPipeline
 from .multi_modal_embedding_pipeline import MultiModalEmbeddingPipeline
+from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
new file mode 100644
index 00000000..97c8cf7b
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -0,0 +1,65 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.multi_modal import MPlugForVisualQuestionAnswering
+from ...preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from ...utils.constant import Tasks
+from ..base import Pipeline, Tensor
+from ..builder import PIPELINES
+
+__all__ = ['VisualQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.visual_question_answering,
+    module_name=Pipelines.visual_question_answering)
+class VisualQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[MPlugForVisualQuestionAnswering, str],
+                 preprocessor: Optional[
+                     MPlugVisualQuestionAnsweringPreprocessor] = None,
+                 **kwargs):
+        """use `model` and `preprocessor` to create a visual question answering pipeline for prediction
+
+        Args:
+            model (MPlugForVisualQuestionAnswering): a model instance
+            preprocessor (MPlugVisualQuestionAnsweringPreprocessor): a preprocessor instance
+        """
+        model = model if isinstance(
+            model,
+            MPlugForVisualQuestionAnswering) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
+                model.model_dir)
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = model.tokenizer
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """process the prediction results
+
+        Args:
+            inputs (Dict[str, Tensor]): the model prediction, token ids of the generated answer
+
+        Returns:
+            Dict[str, str]: the prediction results
+        """
+        replace_tokens_bert = (('[unused0]', ''), ('[PAD]', ''),
+                               ('[unused1]', ''), (r' +', ' '), ('[SEP]', ''),
+                               ('[unused2]', ''), ('[CLS]', ''), ('[UNK]', ''))
+
+        pred_string = self.tokenizer.decode(inputs[0][0])
+        for _old, _new in replace_tokens_bert:
+            pred_string = pred_string.replace(_old, _new)
+        # strip leading/trailing whitespace left over after token removal
+        pred_string = pred_string.strip()
+        return {'answer': pred_string}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 1bc06ce3..694688f6 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -6,6 +6,6 @@ from .builder import PREPROCESSORS, build_preprocessor
 from .common import Compose
 from .image import LoadImage, load_image
 from .kws import WavToLists
-from .multi_modal import OfaImageCaptionPreprocessor
+from .multi_modal import *  # noqa F403
 from .nlp import *  # noqa F403
 from .text_to_speech import *  # noqa F403
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 7c8f0fab..1bc686eb 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -16,6 +16,7 @@ from .image import load_image

 __all__ = [
     'OfaImageCaptionPreprocessor',
+    'MPlugVisualQuestionAnsweringPreprocessor',
 ]

@@ -110,3 +111,47 @@ class OfaImageCaptionPreprocessor(Preprocessor):
             }
         }
         return sample
+
+
+@PREPROCESSORS.register_module(
+    Fields.multi_modal,
+    module_name=Preprocessors.mplug_visual_question_answering)
+class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """preprocess the data via the 'bert-base-uncased' tokenizer and the model configuration
+
+        """
+        super().__init__(*args, **kwargs)
+
+        # tokenizer
+        from transformers import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+        # load configuration
+        from sofa.models.mplug import CONFIG_NAME, MPlugConfig
+        config = MPlugConfig.from_yaml_file(osp.join(model_dir, CONFIG_NAME))
+
+        # Initialize transform
+        from torchvision import transforms
+        mean = (0.48145466, 0.4578275, 0.40821073)
+        std = (0.26862954, 0.26130258, 0.27577711)
+
+        self.patch_resize_transform = transforms.Compose([
+            transforms.Resize((config.image_res, config.image_res),
+                              interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=mean, std=std),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image, question = data['image'], data['question']
+        image = Image.open(image).convert('RGB') if isinstance(image,
+                                                               str) else image
+        image = self.patch_resize_transform(image)
+        image = torch.stack([image], dim=0)
+        question = self.tokenizer([question.lower()],
+                                  padding='longest',
+                                  return_tensors='pt')
+
+        return {'image': image, 'question': question, 'train': False}
diff --git
a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 44bd1dff..3ce3ab98 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -61,6 +61,7 @@ class Tasks(object): visual_grounding = 'visual-grounding' text_to_image_synthesis = 'text-to-image-synthesis' multi_modal_embedding = 'multi-modal-embedding' + visual_question_answering = 'visual-question-answering' class InputFields(object): diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 261b9ec5..574bf856 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1 +1 @@ -https://alinlp.alibaba-inc.com/pypi/sofa-1.0.3-py3-none-any.whl +https://alinlp.alibaba-inc.com/pypi/sofa-1.0.4.1-py3-none-any.whl diff --git a/tests/pipelines/test_visual_question_answering.py b/tests/pipelines/test_visual_question_answering.py new file mode 100644 index 00000000..4577607e --- /dev/null +++ b/tests/pipelines/test_visual_question_answering.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering +from modelscope.pipelines import VisualQuestionAnsweringPipeline, pipeline +from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class VisualQuestionAnsweringTest(unittest.TestCase): + model_id = 'damo/mplug_visual-question-answering_coco_large_en' + input_vqa = { + 'image': 'data/test/images/image_mplug_vqa.jpg', + 'question': 'What is the woman doing?', + } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run(self): + cache_path = snapshot_download(self.model_id) + preprocessor = MPlugVisualQuestionAnsweringPreprocessor(cache_path) + model = MPlugForVisualQuestionAnswering(cache_path) + pipeline1 = VisualQuestionAnsweringPipeline( + model, preprocessor=preprocessor) + pipeline2 = pipeline( + Tasks.visual_question_answering, + model=model, + preprocessor=preprocessor) + print(f"question: {self.input_vqa['question']}") + print(f"pipeline1: {pipeline1(self.input_vqa)['answer']}") + print(f"pipeline2: {pipeline2(self.input_vqa)['answer']}") + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = MPlugVisualQuestionAnsweringPreprocessor( + model.model_dir) + pipeline_vqa = pipeline( + task=Tasks.visual_question_answering, + model=model, + preprocessor=preprocessor) + print(pipeline_vqa(self.input_vqa)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_vqa = pipeline( + Tasks.visual_question_answering, model=self.model_id) + print(pipeline_vqa(self.input_vqa)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_vqa = pipeline(task=Tasks.visual_question_answering) + print(pipeline_vqa(self.input_vqa)) + + +if __name__ == '__main__': + unittest.main()
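Usage note (a minimal sketch, not part of any patch above): the snippet below shows how the new `_sanitize_parameters` routing from PATCH 8/9 is exercised end to end through the zero-shot classification pipeline. The task name, model id, keyword arguments and output keys are all taken verbatim from the patch; the only assumptions are that modelscope is installed and the hub model can be downloaded.

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    classifier = pipeline(
        task=Tasks.zero_shot_classification,
        model='damo/nlp_structbert_zero-shot-classification_chinese-base')

    # `candidate_labels` is routed to both the preprocess and postprocess
    # params, `hypothesis_template` to preprocess only, and `multi_label`
    # to postprocess only, per ZeroShotClassificationPipeline._sanitize_parameters.
    result = classifier(
        '全新突破 解放军运20版空中加油机曝光',
        candidate_labels=['军事', '体育', '文化'],
        hypothesis_template='这篇文章的标题是{}',
        multi_label=False)
    print(result['labels'])  # candidate labels sorted by descending score
    print(result['scores'])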
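The same call pattern applies to the MPLUG visual question answering pipeline from PATCH 9/9; the sketch below mirrors the added test file. The model id, input dict keys and the 'answer' output key come straight from the patch; the test image is the LFS file added above, so a checkout with git-lfs is assumed.

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    vqa = pipeline(
        task=Tasks.visual_question_answering,
        model='damo/mplug_visual-question-answering_coco_large_en')

    # the preprocessor resizes and normalizes the image, and tokenizes the
    # lower-cased question with the 'bert-base-uncased' tokenizer
    result = vqa({
        'image': 'data/test/images/image_mplug_vqa.jpg',
        'question': 'What is the woman doing?',
    })
    print(result['answer'])  # plain-text answer decoded from predicted token ids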