From 5da470fd5d8a8a91936a41b21ad6ab1ebb9f3ba0 Mon Sep 17 00:00:00 2001
From: "feiwu.yfw" <feiwu.yfw@alibaba-inc.com>
Date: Tue, 28 Jun 2022 20:40:57 +0800
Subject: [PATCH 1/6] [to #42791465, #42779255, #42777959,  #42757844,
 #42756050, #42746916, #42743595, #42791863] fix: fix msdataset

 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9174075

* fix msdataset
---
 modelscope/hub/errors.py                    | 15 ++++++
 modelscope/msdatasets/config.py             |  2 +-
 modelscope/msdatasets/ms_dataset.py         | 56 +++++++++++++++------
 modelscope/msdatasets/utils/ms_api.py       | 48 ++++++++++++------
 modelscope/utils/constant.py                | 10 +++-
 tests/msdatasets/test_ms_dataset.py         | 24 +++++----
 tests/pipelines/test_image_matting.py       |  3 +-
 tests/pipelines/test_text_classification.py |  8 ++-
 8 files changed, 121 insertions(+), 45 deletions(-)

diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index 4b39d6e3..d39036a0 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -32,3 +32,18 @@ def raise_on_error(rsp):
         return True
     else:
         raise RequestError(rsp['Message'])
+
+
+# TODO: use raise_on_error instead once modelhub and datahub responses have a uniform structure.
+def datahub_raise_on_error(url, rsp):
+    """Raise an exception if the response reports an error.
+
+    Args:
+        rsp (dict): The server response.
+    """
+    if rsp.get('Code') == 200:
+        return True
+    else:
+        raise RequestError(
+            f"Url = {url}, Status = {rsp.get('status')}, error = {rsp.get('error')}, message = {rsp.get('message')}"
+        )
diff --git a/modelscope/msdatasets/config.py b/modelscope/msdatasets/config.py
index e916b3ec..00c24c3a 100644
--- a/modelscope/msdatasets/config.py
+++ b/modelscope/msdatasets/config.py
@@ -19,4 +19,4 @@ DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
 
 MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://101.201.119.157:31752')
+                                 'http://123.57.189.90:31752')
diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py
index 0466894c..90964b36 100644
--- a/modelscope/msdatasets/ms_dataset.py
+++ b/modelscope/msdatasets/ms_dataset.py
@@ -3,7 +3,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
                     Sequence, Union)
 
 import numpy as np
-from datasets import Dataset
+from datasets import Dataset, DatasetDict
 from datasets import load_dataset as hf_load_dataset
 from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
 from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
@@ -12,7 +12,7 @@ from datasets.utils.file_utils import (is_relative_path,
 
 from modelscope.msdatasets.config import MS_DATASETS_CACHE
 from modelscope.msdatasets.utils.ms_api import MsApi
-from modelscope.utils.constant import Hubs
+from modelscope.utils.constant import DownloadMode, Hubs
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -34,6 +34,10 @@ class MsDataset:
 
     def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
         self._hf_ds = hf_ds
+        if target is not None and target not in self._hf_ds.features:
+            raise TypeError(
+                f'"target" must be a column of the dataset({list(self._hf_ds.features.keys())}, but got {target}'
+            )
         self.target = target
 
     def __iter__(self):
@@ -48,17 +52,23 @@ class MsDataset:
 
     @classmethod
     def from_hf_dataset(cls,
-                        hf_ds: Dataset,
+                        hf_ds: Union[Dataset, DatasetDict],
                         target: str = None) -> Union[dict, 'MsDataset']:
         if isinstance(hf_ds, Dataset):
             return cls(hf_ds, target)
-        if len(hf_ds.keys()) == 1:
-            return cls(next(iter(hf_ds.values())), target)
-        return {k: cls(v, target) for k, v in hf_ds.items()}
+        elif isinstance(hf_ds, DatasetDict):
+            if len(hf_ds.keys()) == 1:
+                return cls(next(iter(hf_ds.values())), target)
+            return {k: cls(v, target) for k, v in hf_ds.items()}
+        else:
+            raise TypeError(
+                f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}'
+            )
 
     @staticmethod
     def load(
         dataset_name: Union[str, list],
+        namespace: Optional[str] = None,
         target: Optional[str] = None,
         version: Optional[str] = None,
         hub: Optional[Hubs] = Hubs.modelscope,
@@ -67,23 +77,32 @@ class MsDataset:
         data_dir: Optional[str] = None,
         data_files: Optional[Union[str, Sequence[str],
                                    Mapping[str, Union[str,
-                                                      Sequence[str]]]]] = None
+                                                      Sequence[str]]]]] = None,
+        download_mode: Optional[DownloadMode] = DownloadMode.
+        REUSE_DATASET_IF_EXISTS
     ) -> Union[dict, 'MsDataset']:
         """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
             Args:
 
                 dataset_name (str): Path or name of the dataset.
+                namespace (str, optional): Namespace of the dataset. It must not be None when loading a remote
+                    dataset from Hubs.modelscope.
                 target (str, optional): Name of the column to output.
                 version (str, optional): Version of the dataset script to load:
                 subset_name (str, optional): Defining the subset_name of the dataset.
                 data_dir (str, optional): Defining the data_dir of the dataset configuration. I
                 data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
                 split (str, optional): Which split of the data to load.
-                hub (Hubs, optional): When loading from a remote hub, where it is from
+                hub (Hubs or str, optional): Which remote hub to load from. Defaults to Hubs.modelscope.
+                download_mode (DownloadMode or str, optional): How to treat existing datasets. Defaults to
+                                                               DownloadMode.REUSE_DATASET_IF_EXISTS.
 
             Returns:
                 MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset.
             """
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        hub = Hubs(hub or Hubs.modelscope)
         if hub == Hubs.huggingface:
             dataset = hf_load_dataset(
                 dataset_name,
@@ -91,21 +110,25 @@ class MsDataset:
                 revision=version,
                 split=split,
                 data_dir=data_dir,
-                data_files=data_files)
+                data_files=data_files,
+                download_mode=download_mode.value)
             return MsDataset.from_hf_dataset(dataset, target=target)
-        else:
+        elif hub == Hubs.modelscope:
             return MsDataset._load_ms_dataset(
                 dataset_name,
+                namespace=namespace,
                 target=target,
                 subset_name=subset_name,
                 version=version,
                 split=split,
                 data_dir=data_dir,
-                data_files=data_files)
+                data_files=data_files,
+                download_mode=download_mode)
 
     @staticmethod
     def _load_ms_dataset(
         dataset_name: Union[str, list],
+        namespace: Optional[str] = None,
         target: Optional[str] = None,
         version: Optional[str] = None,
         subset_name: Optional[str] = None,
@@ -113,17 +136,19 @@ class MsDataset:
         data_dir: Optional[str] = None,
         data_files: Optional[Union[str, Sequence[str],
                                    Mapping[str, Union[str,
-                                                      Sequence[str]]]]] = None
+                                                      Sequence[str]]]]] = None,
+        download_mode: Optional[DownloadMode] = None
     ) -> Union[dict, 'MsDataset']:
         if isinstance(dataset_name, str):
             use_hf = False
             if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
                     (os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
                 use_hf = True
-            elif is_relative_path(dataset_name):
+            elif is_relative_path(dataset_name) and dataset_name.count(
+                    '/') == 0:
                 ms_api = MsApi()
                 dataset_scripts = ms_api.fetch_dataset_scripts(
-                    dataset_name, version)
+                    dataset_name, namespace, download_mode, version)
                 if 'py' in dataset_scripts:  # dataset copied from hf datasets
                     dataset_name = dataset_scripts['py'][0]
                     use_hf = True
@@ -140,7 +165,8 @@ class MsDataset:
                     split=split,
                     data_dir=data_dir,
                     data_files=data_files,
-                    cache_dir=MS_DATASETS_CACHE)
+                    cache_dir=MS_DATASETS_CACHE,
+                    download_mode=download_mode.value)
             else:
                 # TODO load from ms datahub
                 raise NotImplementedError(
diff --git a/modelscope/msdatasets/utils/ms_api.py b/modelscope/msdatasets/utils/ms_api.py
index fc3bcca2..c9b49ca1 100644
--- a/modelscope/msdatasets/utils/ms_api.py
+++ b/modelscope/msdatasets/utils/ms_api.py
@@ -1,11 +1,14 @@
 import os
+import shutil
 from collections import defaultdict
 from typing import Optional
 
 import requests
 
+from modelscope.hub.errors import NotExistError, datahub_raise_on_error
 from modelscope.msdatasets.config import (DOWNLOADED_DATASETS_PATH,
                                           MS_HUB_ENDPOINT)
+from modelscope.utils.constant import DownloadMode
 from modelscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -27,23 +30,38 @@ class MsApi:
 
     def fetch_dataset_scripts(self,
                               dataset_name: str,
-                              version: Optional[str] = 'master',
-                              force_download=False):
-        datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}'
-        r = requests.get(datahub_url)
-        r.raise_for_status()
-        dataset_list = r.json()['Data']
-        if len(dataset_list) == 0:
-            return None
-        dataset_id = dataset_list[0]['Id']
+                              namespace: str,
+                              download_mode: Optional[DownloadMode],
+                              version: Optional[str] = 'master'):
+        if namespace is None:
+            raise ValueError(
+                f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}'
+            )
         version = version or 'master'
-        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
-        r = requests.get(datahub_url)
-        r.raise_for_status()
-        file_list = r.json()['Data']['Files']
         cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
-                                 version)
+                                 namespace, version)
+        download_mode = DownloadMode(download_mode
+                                     or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(
+                cache_dir):
+            shutil.rmtree(cache_dir)
         os.makedirs(cache_dir, exist_ok=True)
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        dataset_id = resp['Data']['Id']
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
+        r = requests.get(datahub_url)
+        resp = r.json()
+        datahub_raise_on_error(datahub_url, resp)
+        file_list = resp['Data']
+        if file_list is None:
+            raise NotExistError(
+                f'The modelscope dataset [dataset_name = {dataset_name}, namespace = {namespace}, '
+                f'version = {version}] dose not exist')
+
+        file_list = file_list['Files']
         local_paths = defaultdict(list)
         for file_info in file_list:
             file_path = file_info['Path']
@@ -54,7 +72,7 @@ class MsApi:
                 r.raise_for_status()
                 content = r.json()['Data']['Content']
                 local_path = os.path.join(cache_dir, file_path)
-                if os.path.exists(local_path) and not force_download:
+                if os.path.exists(local_path):
                     logger.warning(
                         f"Reusing dataset {dataset_name}'s python file ({local_path})"
                     )
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index f2215359..55f015e8 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import enum
 
 
 class Fields(object):
@@ -69,13 +70,20 @@ class InputFields(object):
     audio = 'audio'
 
 
-class Hubs(object):
+class Hubs(enum.Enum):
     """ Source from which an entity (such as a Dataset or Model) is stored
     """
     modelscope = 'modelscope'
     huggingface = 'huggingface'
 
 
+class DownloadMode(enum.Enum):
+    """ How to treat existing datasets
+    """
+    REUSE_DATASET_IF_EXISTS = 'reuse_dataset_if_exists'
+    FORCE_REDOWNLOAD = 'force_redownload'
+
+
 class ModelFile(object):
     CONFIGURATION = 'configuration.json'
     README = 'README.md'
diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py
index de413d5f..50767fd8 100644
--- a/tests/msdatasets/test_ms_dataset.py
+++ b/tests/msdatasets/test_ms_dataset.py
@@ -32,11 +32,12 @@ class ImgPreprocessor(Preprocessor):
 
 class MsDatasetTest(unittest.TestCase):
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_ds_basic(self):
-        ms_ds_full = MsDataset.load('squad')
+        ms_ds_full = MsDataset.load('squad', namespace='damotest')
         ms_ds_full_hf = hfdata.load_dataset('squad')
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         ms_ds_train_hf = hfdata.load_dataset('squad', split='train')
         ms_image_train = MsDataset.from_hf_dataset(
             hfdata.load_dataset('beans', split='train'))
@@ -48,7 +49,7 @@ class MsDatasetTest(unittest.TestCase):
         print(next(iter(ms_ds_train)))
         print(next(iter(ms_image_train)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_text(self):
         model_id = 'damo/bert-base-sst2'
@@ -57,13 +58,14 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor)
         import torch
         dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5)
         print(next(iter(dataloader)))
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     @require_tf
     def test_to_tf_dataset_text(self):
         import tensorflow as tf
@@ -74,7 +76,8 @@ class MsDatasetTest(unittest.TestCase):
             nlp_model.model_dir,
             first_sequence='context',
             second_sequence=None)
-        ms_ds_train = MsDataset.load('squad', split='train')
+        ms_ds_train = MsDataset.load(
+            'squad', namespace='damotest', split='train')
         tf_dataset = ms_ds_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
@@ -85,8 +88,8 @@ class MsDatasetTest(unittest.TestCase):
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     @require_torch
     def test_to_torch_dataset_img(self):
-        ms_image_train = MsDataset.from_hf_dataset(
-            hfdata.load_dataset('beans', split='train'))
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         pt_dataset = ms_image_train.to_torch_dataset(
             preprocessors=ImgPreprocessor(
                 image_path='image_file_path', label='labels'))
@@ -99,7 +102,8 @@ class MsDatasetTest(unittest.TestCase):
     def test_to_tf_dataset_img(self):
         import tensorflow as tf
         tf.compat.v1.enable_eager_execution()
-        ms_image_train = MsDataset.load('beans', split='train')
+        ms_image_train = MsDataset.load(
+            'beans', namespace='damotest', split='train')
         tf_dataset = ms_image_train.to_tf_dataset(
             batch_size=5,
             shuffle=True,
diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py
index de60ff0b..48a715f1 100644
--- a/tests/pipelines/test_image_matting.py
+++ b/tests/pipelines/test_image_matting.py
@@ -62,7 +62,8 @@ class ImageMattingTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
-        dataset = MsDataset.load('beans', split='train', target='image')
+        dataset = MsDataset.load(
+            'beans', namespace='damotest', split='train', target='image')
         img_matting = pipeline(Tasks.image_matting, model=self.model_id)
         result = img_matting(dataset)
         for i in range(10):
diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py
index f913490c..1bf9f7ca 100644
--- a/tests/pipelines/test_text_classification.py
+++ b/tests/pipelines/test_text_classification.py
@@ -87,12 +87,16 @@ class SequenceClassificationTest(unittest.TestCase):
         result = text_classification(dataset)
         self.printDataset(result)
 
-    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
     def test_run_with_modelscope_dataset(self):
         text_classification = pipeline(task=Tasks.text_classification)
         # loaded from modelscope dataset
         dataset = MsDataset.load(
-            'squad', split='train', target='context', hub=Hubs.modelscope)
+            'squad',
+            namespace='damotest',
+            split='train',
+            target='context',
+            hub=Hubs.modelscope)
         result = text_classification(dataset)
         self.printDataset(result)
 

From 0d17eb5b395b0d1a74e1a10ad754843bd6dfc71b Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Tue, 28 Jun 2022 21:12:15 +0800
Subject: [PATCH 2/6] [to #42849800 #42822853 #42822836 #42822791 #42822717
 #42820011]fix: test bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复测试bug
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9186775

    * [to #42849800 #42822853 #42822836 #42822791 #42822717 #42820011]fix: test bugs
---
 modelscope/hub/api.py                    | 84 ++++++++++++++++-------
 modelscope/hub/errors.py                 |  4 ++
 modelscope/hub/file_download.py          | 16 +++--
 modelscope/hub/git.py                    |  8 +++
 modelscope/hub/repository.py             | 12 ++--
 modelscope/hub/snapshot_download.py      | 16 ++---
 modelscope/hub/utils/caching.py          |  8 ++-
 modelscope/utils/hub.py                  |  5 +-
 tests/hub/test_hub_operation.py          | 42 ++++++++++--
 tests/hub/test_hub_private_files.py      | 85 ++++++++++++++++++++++++
 tests/hub/test_hub_private_repository.py |  9 ++-
 tests/hub/test_hub_repository.py         | 24 ++-----
 12 files changed, 235 insertions(+), 78 deletions(-)
 create mode 100644 tests/hub/test_hub_private_files.py

diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py
index d102219b..e79bfd41 100644
--- a/modelscope/hub/api.py
+++ b/modelscope/hub/api.py
@@ -9,7 +9,7 @@ import requests
 
 from modelscope.utils.logger import get_logger
 from .constants import MODELSCOPE_URL_SCHEME
-from .errors import NotExistError, is_ok, raise_on_error
+from .errors import InvalidParameter, NotExistError, is_ok, raise_on_error
 from .utils.utils import (get_endpoint, get_gitlab_domain,
                           model_id_to_group_owner_name)
 
@@ -61,17 +61,21 @@ class HubApi:
 
         return d['Data']['AccessToken'], cookies
 
-    def create_model(self, model_id: str, chinese_name: str, visibility: int,
-                     license: str) -> str:
+    def create_model(
+        self,
+        model_id: str,
+        visibility: str,
+        license: str,
+        chinese_name: Optional[str] = None,
+    ) -> str:
         """
         Create model repo at ModelScopeHub
 
         Args:
             model_id:(`str`): The model id
-            chinese_name(`str`): chinese name of the model
-            visibility(`int`): visibility of the model(1-private, 3-internal, 5-public)
-            license(`str`): license of the model, candidates can be found at: TBA
-
+            visibility(`int`): visibility of the model(1-private, 5-public), default public.
+            license(`str`): license of the model, default none.
+            chinese_name(`str`, *optional*): Chinese name of the model
         Returns:
             name of the model created
 
@@ -79,6 +83,8 @@ class HubApi:
             model_id = {owner}/{name}
         </Tip>
         """
+        if model_id is None:
+            raise InvalidParameter('model_id is required!')
         cookies = ModelScopeConfig.get_cookies()
         if cookies is None:
             raise ValueError('Token does not exist, please login first.')
@@ -151,11 +157,33 @@ class HubApi:
         else:
             r.raise_for_status()
 
+    def _check_cookie(self,
+                      use_cookies: Union[bool,
+                                         CookieJar] = False) -> CookieJar:
+        cookies = None
+        if isinstance(use_cookies, CookieJar):
+            cookies = use_cookies
+        elif use_cookies:
+            cookies = ModelScopeConfig.get_cookies()
+            if cookies is None:
+                raise ValueError('Token does not exist, please login first.')
+        return cookies
+
     def get_model_branches_and_tags(
         self,
         model_id: str,
+        use_cookies: Union[bool, CookieJar] = False
     ) -> Tuple[List[str], List[str]]:
-        cookies = ModelScopeConfig.get_cookies()
+        """Get the branches and tags of a model.
+
+        Args:
+            model_id (str): The model id
+            use_cookies (Union[bool, CookieJar], optional): If a CookieJar, use that cookie; if True,
+                        load the cookie from local. Defaults to False.
+        Returns:
+            Tuple[List[str], List[str]]: The branches and the tags of the model.
+        """
+        cookies = self._check_cookie(use_cookies)
 
         path = f'{self.endpoint}/api/v1/models/{model_id}/revisions'
         r = requests.get(path, cookies=cookies)
@@ -169,23 +197,33 @@ class HubApi:
                 ] if info['RevisionMap']['Tags'] else []
         return branches, tags
 
-    def get_model_files(
-            self,
-            model_id: str,
-            revision: Optional[str] = 'master',
-            root: Optional[str] = None,
-            recursive: Optional[str] = False,
-            use_cookies: Union[bool, CookieJar] = False) -> List[dict]:
+    def get_model_files(self,
+                        model_id: str,
+                        revision: Optional[str] = 'master',
+                        root: Optional[str] = None,
+                        recursive: Optional[str] = False,
+                        use_cookies: Union[bool, CookieJar] = False,
+                        is_snapshot: Optional[bool] = True) -> List[dict]:
+        """List the files of a model.
 
-        cookies = None
-        if isinstance(use_cookies, CookieJar):
-            cookies = use_cookies
-        elif use_cookies:
-            cookies = ModelScopeConfig.get_cookies()
-            if cookies is None:
-                raise ValueError('Token does not exist, please login first.')
+        Args:
+            model_id (str): The model id
+            revision (Optional[str], optional): The branch or tag name. Defaults to 'master'.
+            root (Optional[str], optional): The root path. Defaults to None.
+            recursive (Optional[str], optional): Whether to list files recursively. Defaults to False.
+            use_cookies (Union[bool, CookieJar], optional): If a CookieJar, use that cookie; if True,
+                        load the cookie from local. Defaults to False.
+            is_snapshot (Optional[bool], optional): True for snapshot_download, otherwise False. Defaults to True.
 
-        path = f'{self.endpoint}/api/v1/models/{model_id}/repo/files?Revision={revision}&Recursive={recursive}'
+        Raises:
+            ValueError: If use_cookies is True but no local cookie exists.
+
+        Returns:
+            List[dict]: Model file list.
+        """
+        path = '%s/api/v1/models/%s/repo/files?Revision=%s&Recursive=%s&Snapshot=%s' % (
+            self.endpoint, model_id, revision, recursive, is_snapshot)
+        cookies = self._check_cookie(use_cookies)
         if root is not None:
             path = path + f'&Root={root}'
 
diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py
index d39036a0..9a19fdb5 100644
--- a/modelscope/hub/errors.py
+++ b/modelscope/hub/errors.py
@@ -10,6 +10,10 @@ class GitError(Exception):
     pass
 
 
+class InvalidParameter(Exception):
+    pass
+
+
 def is_ok(rsp):
     """ Check the request is ok
 
diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py
index b92bf89c..60aae3b6 100644
--- a/modelscope/hub/file_download.py
+++ b/modelscope/hub/file_download.py
@@ -7,6 +7,7 @@ import tempfile
 import time
 from functools import partial
 from hashlib import sha256
+from http.cookiejar import CookieJar
 from pathlib import Path
 from typing import BinaryIO, Dict, Optional, Union
 from uuid import uuid4
@@ -107,7 +108,9 @@ def model_file_download(
 
     _api = HubApi()
     headers = {'user-agent': http_user_agent(user_agent=user_agent, )}
-    branches, tags = _api.get_model_branches_and_tags(model_id)
+    cookies = ModelScopeConfig.get_cookies()
+    branches, tags = _api.get_model_branches_and_tags(
+        model_id, use_cookies=False if cookies is None else cookies)
     file_to_download_info = None
     is_commit_id = False
     if revision in branches or revision in tags:  # The revision is version or tag,
@@ -117,18 +120,19 @@ def model_file_download(
             model_id=model_id,
             revision=revision,
             recursive=True,
-        )
+            use_cookies=False if cookies is None else cookies,
+            is_snapshot=False)
 
         for model_file in model_files:
             if model_file['Type'] == 'tree':
                 continue
 
             if model_file['Path'] == file_path:
-                model_file['Branch'] = revision
                 if cache.exists(model_file):
                     return cache.get_file_by_info(model_file)
                 else:
                     file_to_download_info = model_file
+                break
 
         if file_to_download_info is None:
             raise NotExistError('The file path: %s not exist in: %s' %
@@ -141,8 +145,6 @@ def model_file_download(
             return cached_file_path  # the file is in cache.
         is_commit_id = True
     # we need to download again
-    # TODO: skip using JWT for authorization, use cookie instead
-    cookies = ModelScopeConfig.get_cookies()
     url_to_download = get_file_download_url(model_id, file_path, revision)
     file_to_download_info = {
         'Path': file_path,
@@ -202,7 +204,7 @@ def http_get_file(
     url: str,
     local_dir: str,
     file_name: str,
-    cookies: Dict[str, str],
+    cookies: CookieJar,
     headers: Optional[Dict[str, str]] = None,
 ):
     """
@@ -217,7 +219,7 @@ def http_get_file(
             local directory where the downloaded file stores
         file_name(`str`):
             name of the file stored in `local_dir`
-        cookies(`Dict[str, str]`):
+        cookies(`CookieJar`):
             cookies used to authentication the user, which is used for downloading private repos
         headers(`Optional[Dict[str, str]] = None`):
             http headers to carry necessary info when requesting the remote file
diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py
index 37f61814..54161f1c 100644
--- a/modelscope/hub/git.py
+++ b/modelscope/hub/git.py
@@ -70,6 +70,14 @@ class GitCommandWrapper(metaclass=Singleton):
         except GitError:
             return False
 
+    def git_lfs_install(self, repo_dir):
+        cmd = ['git', '-C', repo_dir, 'lfs', 'install']
+        try:
+            self._run_git_command(*cmd)
+            return True
+        except GitError:
+            return False
+
     def clone(self,
               repo_base_dir: str,
               token: str,
diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py
index d9322144..37dec571 100644
--- a/modelscope/hub/repository.py
+++ b/modelscope/hub/repository.py
@@ -1,7 +1,7 @@
 import os
 from typing import List, Optional
 
-from modelscope.hub.errors import GitError
+from modelscope.hub.errors import GitError, InvalidParameter
 from modelscope.utils.logger import get_logger
 from .api import ModelScopeConfig
 from .constants import MODELSCOPE_URL_SCHEME
@@ -49,6 +49,8 @@ class Repository:
         git_wrapper = GitCommandWrapper()
         if not git_wrapper.is_lfs_installed():
             logger.error('git lfs is not installed, please install.')
+        else:
+            git_wrapper.git_lfs_install(self.model_dir)  # init repo lfs
 
         self.git_wrapper = GitCommandWrapper(git_path)
         os.makedirs(self.model_dir, exist_ok=True)
@@ -74,8 +76,6 @@ class Repository:
 
     def push(self,
              commit_message: str,
-             files: List[str] = list(),
-             all_files: bool = False,
              branch: Optional[str] = 'master',
              force: bool = False):
         """Push local to remote, this method will do.
@@ -86,8 +86,12 @@ class Repository:
             commit_message (str): commit message
             revision (Optional[str], optional): which branch to push. Defaults to 'master'.
         """
+        if commit_message is None:
+            msg = 'commit_message must be provided!'
+            raise InvalidParameter(msg)
         url = self.git_wrapper.get_repo_remote_url(self.model_dir)
-        self.git_wrapper.add(self.model_dir, files, all_files)
+        self.git_wrapper.pull(self.model_dir)
+        self.git_wrapper.add(self.model_dir, all_files=True)
         self.git_wrapper.commit(self.model_dir, commit_message)
         self.git_wrapper.push(
             repo_dir=self.model_dir,
diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py
index 90d850f4..91463f76 100644
--- a/modelscope/hub/snapshot_download.py
+++ b/modelscope/hub/snapshot_download.py
@@ -20,8 +20,7 @@ def snapshot_download(model_id: str,
                       revision: Optional[str] = 'master',
                       cache_dir: Union[str, Path, None] = None,
                       user_agent: Optional[Union[Dict, str]] = None,
-                      local_files_only: Optional[bool] = False,
-                      private: Optional[bool] = False) -> str:
+                      local_files_only: Optional[bool] = False) -> str:
     """Download all files of a repo.
     Downloads a whole snapshot of a repo's files at the specified revision. This
     is useful when you want all files from a repo, because you don't know which
@@ -79,8 +78,10 @@ def snapshot_download(model_id: str,
         # make headers
         headers = {'user-agent': http_user_agent(user_agent=user_agent, )}
         _api = HubApi()
+        cookies = ModelScopeConfig.get_cookies()
         # get file list from model repo
-        branches, tags = _api.get_model_branches_and_tags(model_id)
+        branches, tags = _api.get_model_branches_and_tags(
+            model_id, use_cookies=False if cookies is None else cookies)
         if revision not in branches and revision not in tags:
             raise NotExistError('The specified branch or tag : %s not exist!'
                                 % revision)
@@ -89,11 +90,8 @@ def snapshot_download(model_id: str,
             model_id=model_id,
             revision=revision,
             recursive=True,
-            use_cookies=private)
-
-        cookies = None
-        if private:
-            cookies = ModelScopeConfig.get_cookies()
+            use_cookies=False if cookies is None else cookies,
+            is_snapshot=True)
 
         for model_file in model_files:
             if model_file['Type'] == 'tree':
@@ -116,7 +114,7 @@ def snapshot_download(model_id: str,
                 local_dir=tempfile.gettempdir(),
                 file_name=model_file['Name'],
                 headers=headers,
-                cookies=None if cookies is None else cookies.get_dict())
+                cookies=cookies)
             # put file to cache
             cache.put_file(
                 model_file,
diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py
index ac258385..7675e49b 100644
--- a/modelscope/hub/utils/caching.py
+++ b/modelscope/hub/utils/caching.py
@@ -101,8 +101,9 @@ class FileSystemCache(object):
         Args:
             key (dict): The cache key.
         """
-        self.cached_files.remove(key)
-        self.save_cached_files()
+        if key in self.cached_files:
+            self.cached_files.remove(key)
+            self.save_cached_files()
 
     def exists(self, key):
         for cache_file in self.cached_files:
@@ -204,6 +205,7 @@ class ModelFileSystemCache(FileSystemCache):
                     return orig_path
                 else:
                     self.remove_key(cached_file)
+                    break
 
         return None
 
@@ -230,6 +232,7 @@ class ModelFileSystemCache(FileSystemCache):
                     cached_key['Revision'].startswith(key['Revision'])
                     or key['Revision'].startswith(cached_key['Revision'])):
                 is_exists = True
+                break
         file_path = os.path.join(self.cache_root_location,
                                  model_file_info['Path'])
         if is_exists:
@@ -253,6 +256,7 @@ class ModelFileSystemCache(FileSystemCache):
                                          cached_file['Path'])
                 if os.path.exists(file_path):
                     os.remove(file_path)
+                break
 
     def put_file(self, model_file_info, model_file_location):
         """Put model on model_file_location to cache, the model first download to /tmp, and move to cache.
diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py
index c427b7a3..3b7e80ef 100644
--- a/modelscope/utils/hub.py
+++ b/modelscope/utils/hub.py
@@ -31,9 +31,10 @@ def create_model_if_not_exist(
     else:
         api.create_model(
             model_id=model_id,
-            chinese_name=chinese_name,
             visibility=visibility,
-            license=license)
+            license=license,
+            chinese_name=chinese_name,
+        )
         print(f'model {model_id} successfully created.')
         return True
 
diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py
index 035b183e..d193ce32 100644
--- a/tests/hub/test_hub_operation.py
+++ b/tests/hub/test_hub_operation.py
@@ -3,6 +3,7 @@ import os
 import tempfile
 import unittest
 import uuid
+from shutil import rmtree
 
 from modelscope.hub.api import HubApi, ModelScopeConfig
 from modelscope.hub.constants import Licenses, ModelVisibility
@@ -23,7 +24,6 @@ download_model_file_name = 'test.bin'
 class HubOperationTest(unittest.TestCase):
 
     def setUp(self):
-        self.old_cwd = os.getcwd()
         self.api = HubApi()
         # note this is temporary before official account management is ready
         self.api.login(USER_NAME, PASSWORD)
@@ -31,19 +31,18 @@ class HubOperationTest(unittest.TestCase):
         self.model_id = '%s/%s' % (model_org, self.model_name)
         self.api.create_model(
             model_id=self.model_id,
-            chinese_name=model_chinese_name,
             visibility=ModelVisibility.PUBLIC,
-            license=Licenses.APACHE_V2)
+            license=Licenses.APACHE_V2,
+            chinese_name=model_chinese_name,
+        )
         temporary_dir = tempfile.mkdtemp()
         self.model_dir = os.path.join(temporary_dir, self.model_name)
         repo = Repository(self.model_dir, clone_from=self.model_id)
-        os.chdir(self.model_dir)
         os.system("echo 'testtest'>%s"
-                  % os.path.join(self.model_dir, 'test.bin'))
-        repo.push('add model', all_files=True)
+                  % os.path.join(self.model_dir, download_model_file_name))
+        repo.push('add model')
 
     def tearDown(self):
-        os.chdir(self.old_cwd)
         self.api.delete_model(model_id=self.model_id)
 
     def test_model_repo_creation(self):
@@ -79,6 +78,35 @@ class HubOperationTest(unittest.TestCase):
         mdtime2 = os.path.getmtime(downloaded_file_path)
         assert mdtime1 == mdtime2
 
+    def test_download_public_without_login(self):
+        rmtree(ModelScopeConfig.path_credential)
+        try:
+            snapshot_path = snapshot_download(model_id=self.model_id)
+            assert os.path.exists(
+                os.path.join(snapshot_path, download_model_file_name))
+            downloaded_file = model_file_download(
+                model_id=self.model_id,
+                file_path=download_model_file_name,
+                cache_dir=tempfile.mkdtemp())
+            assert os.path.exists(downloaded_file)
+        finally:  # log back in even on failure, before tearDown runs
+            self.api.login(USER_NAME, PASSWORD)
+
+    def test_snapshot_delete_download_cache_file(self):
+        snapshot_path = snapshot_download(model_id=self.model_id)
+        downloaded_file_path = os.path.join(snapshot_path,
+                                            download_model_file_name)
+        assert os.path.exists(downloaded_file_path)
+        os.remove(downloaded_file_path)
+        # README.md was not removed; presumably it is served from the cache
+        file_download_path = model_file_download(
+            model_id=self.model_id, file_path='README.md')
+        assert os.path.exists(file_download_path)
+        # the file removed above has to be available again after this call
+        file_download_path = model_file_download(
+            model_id=self.model_id, file_path=download_model_file_name)
+        assert os.path.exists(file_download_path)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/hub/test_hub_private_files.py b/tests/hub/test_hub_private_files.py
new file mode 100644
index 00000000..b9c71456
--- /dev/null
+++ b/tests/hub/test_hub_private_files.py
@@ -0,0 +1,85 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import tempfile
+import unittest
+import uuid
+
+from requests.exceptions import HTTPError
+
+from modelscope.hub.api import HubApi
+from modelscope.hub.constants import Licenses, ModelVisibility
+from modelscope.hub.errors import GitError
+from modelscope.hub.file_download import model_file_download
+from modelscope.hub.repository import Repository
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.utils.constant import ModelFile
+
+USER_NAME = 'maasadmin'
+PASSWORD = '12345678'
+USER_NAME2 = 'sdkdev'
+
+model_chinese_name = '达摩卡通化模型'
+model_org = 'unittest'
+
+
+class HubPrivateFileDownloadTest(unittest.TestCase):
+
+    def setUp(self):
+        self.old_cwd = os.getcwd()
+        self.api = HubApi()
+        # note this is temporary before official account management is ready
+        self.token, _ = self.api.login(USER_NAME, PASSWORD)
+        self.model_name = uuid.uuid4().hex
+        self.model_id = '%s/%s' % (model_org, self.model_name)
+        self.api.create_model(
+            model_id=self.model_id,
+            visibility=ModelVisibility.PRIVATE,  # 1-private, 5-public
+            license=Licenses.APACHE_V2,
+            chinese_name=model_chinese_name,
+        )
+
+    def tearDown(self):
+        os.chdir(self.old_cwd)
+        # a no-permission test may have left USER_NAME2 logged in
+        self.api.login(USER_NAME, PASSWORD)
+        self.api.delete_model(model_id=self.model_id)
+
+    def test_snapshot_download_private_model(self):
+        snapshot_path = snapshot_download(self.model_id)
+        assert os.path.exists(os.path.join(snapshot_path, ModelFile.README))
+
+    def test_snapshot_download_private_model_no_permission(self):
+        self.token, _ = self.api.login(USER_NAME2, PASSWORD)
+        with self.assertRaises(HTTPError):
+            snapshot_download(self.model_id)
+
+    def test_download_file_private_model(self):
+        file_path = model_file_download(self.model_id, ModelFile.README)
+        assert os.path.exists(file_path)
+
+    def test_download_file_private_model_no_permission(self):
+        self.token, _ = self.api.login(USER_NAME2, PASSWORD)
+        with self.assertRaises(HTTPError):
+            model_file_download(self.model_id, ModelFile.README)
+
+    def test_snapshot_download_local_only(self):
+        with self.assertRaises(ValueError):
+            snapshot_download(self.model_id, local_files_only=True)
+        snapshot_path = snapshot_download(self.model_id)
+        assert os.path.exists(os.path.join(snapshot_path, ModelFile.README))
+        snapshot_path = snapshot_download(self.model_id, local_files_only=True)
+        assert os.path.exists(snapshot_path)
+
+    def test_file_download_local_only(self):
+        with self.assertRaises(ValueError):
+            model_file_download(
+                self.model_id, ModelFile.README, local_files_only=True)
+        file_path = model_file_download(self.model_id, ModelFile.README)
+        assert os.path.exists(file_path)
+        file_path = model_file_download(
+            self.model_id, ModelFile.README, local_files_only=True)
+        assert os.path.exists(file_path)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/hub/test_hub_private_repository.py b/tests/hub/test_hub_private_repository.py
index b6e3536c..01a89586 100644
--- a/tests/hub/test_hub_private_repository.py
+++ b/tests/hub/test_hub_private_repository.py
@@ -5,6 +5,7 @@ import unittest
 import uuid
 
 from modelscope.hub.api import HubApi
+from modelscope.hub.constants import Licenses, ModelVisibility
 from modelscope.hub.errors import GitError
 from modelscope.hub.repository import Repository
 
@@ -16,9 +17,6 @@ model_chinese_name = '达摩卡通化模型'
 model_org = 'unittest'
 DEFAULT_GIT_PATH = 'git'
 
-sample_model_url = 'https://mindscope.oss-cn-hangzhou.aliyuncs.com/test_models/mnist-12.onnx'
-download_model_file_name = 'mnist-12.onnx'
-
 
 class HubPrivateRepositoryTest(unittest.TestCase):
 
@@ -31,9 +29,10 @@ class HubPrivateRepositoryTest(unittest.TestCase):
         self.model_id = '%s/%s' % (model_org, self.model_name)
         self.api.create_model(
             model_id=self.model_id,
+            visibility=ModelVisibility.PRIVATE,  # 1-private, 5-public
+            license=Licenses.APACHE_V2,
             chinese_name=model_chinese_name,
-            visibility=1,  # 1-private, 5-public
-            license='apache-2.0')
+        )
 
     def tearDown(self):
         self.api.login(USER_NAME, PASSWORD)
diff --git a/tests/hub/test_hub_repository.py b/tests/hub/test_hub_repository.py
index 7b1cc751..99f63eca 100644
--- a/tests/hub/test_hub_repository.py
+++ b/tests/hub/test_hub_repository.py
@@ -2,7 +2,6 @@
 import os
 import shutil
 import tempfile
-import time
 import unittest
 import uuid
 from os.path import expanduser
@@ -10,6 +9,7 @@ from os.path import expanduser
 from requests import delete
 
 from modelscope.hub.api import HubApi
+from modelscope.hub.constants import Licenses, ModelVisibility
 from modelscope.hub.errors import NotExistError
 from modelscope.hub.file_download import model_file_download
 from modelscope.hub.repository import Repository
@@ -55,9 +55,10 @@ class HubRepositoryTest(unittest.TestCase):
         self.model_id = '%s/%s' % (model_org, self.model_name)
         self.api.create_model(
             model_id=self.model_id,
+            visibility=ModelVisibility.PUBLIC,  # 1-private, 5-public
+            license=Licenses.APACHE_V2,
             chinese_name=model_chinese_name,
-            visibility=5,  # 1-private, 5-public
-            license='apache-2.0')
+        )
         temporary_dir = tempfile.mkdtemp()
         self.model_dir = os.path.join(temporary_dir, self.model_name)
 
@@ -81,27 +82,12 @@ class HubRepositoryTest(unittest.TestCase):
         os.chdir(self.model_dir)
         os.system("echo '111'>%s" % os.path.join(self.model_dir, 'add1.py'))
         os.system("echo '222'>%s" % os.path.join(self.model_dir, 'add2.py'))
-        repo.push('test', all_files=True)
+        repo.push('test')
         add1 = model_file_download(self.model_id, 'add1.py')
         assert os.path.exists(add1)
         add2 = model_file_download(self.model_id, 'add2.py')
         assert os.path.exists(add2)
 
-    def test_push_files(self):
-        repo = Repository(self.model_dir, clone_from=self.model_id)
-        assert os.path.exists(os.path.join(self.model_dir, 'README.md'))
-        os.system("echo '111'>%s" % os.path.join(self.model_dir, 'add1.py'))
-        os.system("echo '222'>%s" % os.path.join(self.model_dir, 'add2.py'))
-        os.system("echo '333'>%s" % os.path.join(self.model_dir, 'add3.py'))
-        repo.push('test', files=['add1.py', 'add2.py'], all_files=False)
-        add1 = model_file_download(self.model_id, 'add1.py')
-        assert os.path.exists(add1)
-        add2 = model_file_download(self.model_id, 'add2.py')
-        assert os.path.exists(add2)
-        with self.assertRaises(NotExistError) as cm:
-            model_file_download(self.model_id, 'add3.py')
-        print(cm.exception)
-
 
 if __name__ == '__main__':
     unittest.main()

From 1cb2fa850f2f9b468798b062bb4bd23065eeea88 Mon Sep 17 00:00:00 2001
From: "wenmeng.zwm" <wenmeng.zwm@alibaba-inc.com>
Date: Tue, 28 Jun 2022 22:19:37 +0800
Subject: [PATCH 3/6] [to #42362425] update version with 0.2.1

---
 modelscope/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelscope/version.py b/modelscope/version.py
index df9144c5..fc79d63d 100644
--- a/modelscope/version.py
+++ b/modelscope/version.py
@@ -1 +1 @@
-__version__ = '0.1.1'
+__version__ = '0.2.1'

From 576b7cffb11532c3431fbfc2998ae833408c327b Mon Sep 17 00:00:00 2001
From: "zhangzhicheng.zzc" <zhangzhicheng.zzc@alibaba-inc.com>
Date: Wed, 29 Jun 2022 09:12:59 +0800
Subject: [PATCH 4/6] [to #42322933] add pipeline params for  preprocess and
 forward & zeroshot classification         Link:
 https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9180863

---
 modelscope/metainfo.py                        |  2 +
 modelscope/models/__init__.py                 |  3 +-
 modelscope/models/nlp/__init__.py             |  1 +
 .../nlp/sbert_for_zero_shot_classification.py | 50 ++++++++++
 modelscope/pipelines/base.py                  | 55 ++++++++---
 modelscope/pipelines/builder.py               |  3 +
 modelscope/pipelines/nlp/__init__.py          |  1 +
 .../nlp/zero_shot_classification_pipeline.py  | 97 +++++++++++++++++++
 modelscope/pipelines/outputs.py               |  7 ++
 modelscope/preprocessors/nlp.py               | 46 ++++++++-
 modelscope/utils/constant.py                  |  1 +
 .../test_zero_shot_classification.py          | 64 ++++++++++++
 12 files changed, 313 insertions(+), 17 deletions(-)
 create mode 100644 modelscope/models/nlp/sbert_for_zero_shot_classification.py
 create mode 100644 modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
 create mode 100644 tests/pipelines/test_zero_shot_classification.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index eda590ac..1d2ee4d2 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -52,6 +52,7 @@ class Pipelines(object):
     text_generation = 'text-generation'
     sentiment_analysis = 'sentiment-analysis'
     fill_mask = 'fill-mask'
+    zero_shot_classification = 'zero-shot-classification'
 
     # audio tasks
     sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
@@ -95,6 +96,7 @@ class Preprocessors(object):
     bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
     palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
     sbert_token_cls_tokenizer = 'sbert-token-cls-tokenizer'
+    zero_shot_cls_tokenizer = 'zero-shot-cls-tokenizer'
 
     # audio preprocessor
     linear_aec_fbank = 'linear-aec-fbank'
diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py
index 816c44e2..f1074f68 100644
--- a/modelscope/models/__init__.py
+++ b/modelscope/models/__init__.py
@@ -7,4 +7,5 @@ from .audio.tts.vocoder import Hifigan16k
 from .base import Model
 from .builder import MODELS, build_model
 from .multi_modal import OfaForImageCaptioning
-from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
+from .nlp import (BertForSequenceClassification, SbertForSentenceSimilarity,
+                  SbertForZeroShotClassification)
diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py
index 6be4493b..f904efdf 100644
--- a/modelscope/models/nlp/__init__.py
+++ b/modelscope/models/nlp/__init__.py
@@ -3,3 +3,4 @@ from .masked_language_model import *  # noqa F403
 from .palm_for_text_generation import *  # noqa F403
 from .sbert_for_sentence_similarity import *  # noqa F403
 from .sbert_for_token_classification import *  # noqa F403
+from .sbert_for_zero_shot_classification import *  # noqa F403
diff --git a/modelscope/models/nlp/sbert_for_zero_shot_classification.py b/modelscope/models/nlp/sbert_for_zero_shot_classification.py
new file mode 100644
index 00000000..837bb41e
--- /dev/null
+++ b/modelscope/models/nlp/sbert_for_zero_shot_classification.py
@@ -0,0 +1,50 @@
+from typing import Any, Dict
+
+import numpy as np
+
+from modelscope.utils.constant import Tasks
+from ...metainfo import Models
+from ..base import Model
+from ..builder import MODELS
+
+__all__ = ['SbertForZeroShotClassification']
+
+
+@MODELS.register_module(
+    Tasks.zero_shot_classification, module_name=Models.structbert)
+class SbertForZeroShotClassification(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """initialize the zero shot classification model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+
+        super().__init__(model_dir, *args, **kwargs)
+        from sofa import SbertForSequenceClassification
+        self.model = SbertForSequenceClassification.from_pretrained(model_dir)
+
+    def train(self):
+        return self.model.train()
+
+    def eval(self):
+        return self.model.eval()
+
+    def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
+        """run the wrapped model and return its logits as a numpy array
+
+        Args:
+            input (Dict[str, Any]): the preprocessed data
+
+        Returns:
+            Dict[str, np.ndarray]: results
+                Example:
+                    {
+                        'logits': array([[-0.53860897,  1.5029076 ]], dtype=float32) # true value
+                    }
+        """
+        outputs = self.model(**input)
+        # detach and move to host first: Tensor.numpy() rejects tensors that
+        # require grad or live on an accelerator device
+        return {'logits': outputs['logits'].detach().cpu().numpy()}
diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py
index 2f5d5dcc..4052d35a 100644
--- a/modelscope/pipelines/base.py
+++ b/modelscope/pipelines/base.py
@@ -74,33 +74,57 @@ class Pipeline(ABC):
         self.preprocessor = preprocessor
 
     def __call__(self, input: Union[Input, List[Input]], *args,
-                 **post_kwargs) -> Union[Dict[str, Any], Generator]:
+                 **kwargs) -> Union[Dict[str, Any], Generator]:
         # model provider should leave it as it is
         # modelscope library developer will handle this function
 
         # simple showcase, need to support iterator type for both tensorflow and pytorch
         # input_dict = self._handle_input(input)
+
+        # sanitize the parameters
+        preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(
+            **kwargs)
+        kwargs['preprocess_params'] = preprocess_params
+        kwargs['forward_params'] = forward_params
+        kwargs['postprocess_params'] = postprocess_params
+
         if isinstance(input, list):
             output = []
             for ele in input:
-                output.append(self._process_single(ele, *args, **post_kwargs))
+                output.append(self._process_single(ele, *args, **kwargs))
 
         elif isinstance(input, MsDataset):
-            return self._process_iterator(input, *args, **post_kwargs)
+            return self._process_iterator(input, *args, **kwargs)
 
         else:
-            output = self._process_single(input, *args, **post_kwargs)
+            output = self._process_single(input, *args, **kwargs)
         return output
 
-    def _process_iterator(self, input: Input, *args, **post_kwargs):
+    def _sanitize_parameters(self, **pipeline_parameters):
+        """Split the keyword args given to `__call__` into three dicts.
+
+        Subclasses may override this to route specific keys to `preprocess`
+        or `forward`; this default sends every key to `postprocess`.
+
+        Returns:
+            dict: preprocess_params, empty by default
+            dict: forward_params, empty by default
+            dict: postprocess_params, all of `pipeline_parameters` by default
+        """
+        return {}, {}, pipeline_parameters
+
+    def _process_iterator(self, input: Input, *args, **kwargs):
         for ele in input:
-            yield self._process_single(ele, *args, **post_kwargs)
+            yield self._process_single(ele, *args, **kwargs)
+
+    def _process_single(self, input: Input, *args, **kwargs) -> Dict[str, Any]:
+        # the param dicts exist only when __call__ injected them; default to {}
+        pre, fwd, post = (kwargs.get(f'{stage}_params') or {} for stage in
+                          ('preprocess', 'forward', 'postprocess'))
 
-    def _process_single(self, input: Input, *args,
-                        **post_kwargs) -> Dict[str, Any]:
-        out = self.preprocess(input)
-        out = self.forward(out)
-        out = self.postprocess(out, **post_kwargs)
+        out = self.preprocess(input, **pre)
+        out = self.forward(out, **fwd)
+        out = self.postprocess(out, **post)
         self._check_output(out)
         return out
 
@@ -120,20 +144,21 @@ class Pipeline(ABC):
             raise ValueError(f'expected output keys are {output_keys}, '
                              f'those {missing_keys} are missing')
 
-    def preprocess(self, inputs: Input) -> Dict[str, Any]:
+    def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]:
         """ Provide default implementation based on preprocess_cfg and user can reimplement it
         """
         assert self.preprocessor is not None, 'preprocess method should be implemented'
         assert not isinstance(self.preprocessor, List),\
             'default implementation does not support using multiple preprocessors.'
-        return self.preprocessor(inputs)
+        return self.preprocessor(inputs, **preprocess_params)
 
-    def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
         """ Provide default implementation using self.model and user can reimplement it
         """
         assert self.model is not None, 'forward method should be implemented'
         assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.'
-        return self.model(inputs)
+        return self.model(inputs, **forward_params)
 
     @abstractmethod
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 41cd73da..847955d4 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -27,6 +27,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                                 'damo/bert-base-sst2'),
     Tasks.text_generation: (Pipelines.text_generation,
                             'damo/nlp_palm2.0_text-generation_chinese-base'),
+    Tasks.zero_shot_classification:
+    (Pipelines.zero_shot_classification,
+     'damo/nlp_structbert_zero-shot-classification_chinese-base'),
     Tasks.image_captioning: (Pipelines.image_caption,
                              'damo/ofa_image-caption_coco_large_en'),
     Tasks.image_generation:
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index c50875fd..5ef12e22 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -3,3 +3,4 @@ from .sentence_similarity_pipeline import *  # noqa F403
 from .sequence_classification_pipeline import *  # noqa F403
 from .text_generation_pipeline import *  # noqa F403
 from .word_segmentation_pipeline import *  # noqa F403
+from .zero_shot_classification_pipeline import *  # noqa F403
diff --git a/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
new file mode 100644
index 00000000..2ed4dac3
--- /dev/null
+++ b/modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
@@ -0,0 +1,97 @@
+import os
+import uuid
+from typing import Any, Dict, Union
+
+import json
+import numpy as np
+import torch
+from scipy.special import softmax
+
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.nlp import SbertForZeroShotClassification
+from ...preprocessors import ZeroShotClassificationPreprocessor
+from ...utils.constant import Tasks
+from ..base import Input, Pipeline
+from ..builder import PIPELINES
+
+__all__ = ['ZeroShotClassificationPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.zero_shot_classification,
+    module_name=Pipelines.zero_shot_classification)
+class ZeroShotClassificationPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[SbertForZeroShotClassification, str],
+                 preprocessor: ZeroShotClassificationPreprocessor = None,
+                 **kwargs):
+        """create a zero-shot classification pipeline for prediction
+
+        Args:
+            model: a SbertForZeroShotClassification, or a str to load one from
+            preprocessor: optional; built from `model.model_dir` when None
+        """
+        assert isinstance(model, str) or isinstance(model, SbertForZeroShotClassification), \
+            'model must be a single str or SbertForZeroShotClassification'
+        model = model if isinstance(
+            model,
+            SbertForZeroShotClassification) else Model.from_pretrained(model)
+        # columns of the model's logits that postprocess reads
+        self.entailment_id = 0
+        self.contradiction_id = 2
+
+        if preprocessor is None:
+            preprocessor = ZeroShotClassificationPreprocessor(model.model_dir)
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+
+    def _sanitize_parameters(self, **kwargs):
+        """split call kwargs into preprocess / forward / postprocess params"""
+        # a missing key and an empty collection are both rejected up front
+        candidate_labels = kwargs.pop('candidate_labels', None)
+        if not candidate_labels:
+            raise ValueError('You must include at least one label.')
+        preprocess_params = {
+            'candidate_labels': candidate_labels,
+            'hypothesis_template': kwargs.pop('hypothesis_template', '{}'),
+        }
+        postprocess_params = {
+            'candidate_labels': candidate_labels,
+            'multi_label': kwargs.pop('multi_label', False),
+        }
+        return preprocess_params, {}, postprocess_params
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self,
+                    inputs: Dict[str, Any],
+                    candidate_labels,
+                    multi_label=False) -> Dict[str, Any]:
+        """rank the candidate labels by their entailment score
+
+        Args:
+            inputs (Dict[str, Any]): model output holding 'logits'
+            candidate_labels: labels in the order given to the preprocessor
+            multi_label: score every label on its own instead of jointly
+
+        Returns:
+            Dict[str, Any]: 'labels' and 'scores', highest score first
+        """
+        logits = inputs['logits']
+        if multi_label or len(candidate_labels) == 1:
+            logits = logits[..., [self.contradiction_id, self.entailment_id]]
+            scores = softmax(logits, axis=-1)[..., 1]
+        else:
+            logits = logits[..., self.entailment_id]
+            scores = softmax(logits, axis=-1)
+
+        reversed_index = list(reversed(scores.argsort()))
+        return {
+            'labels': [candidate_labels[i] for i in reversed_index],
+            'scores': [scores[i].item() for i in reversed_index],
+        }
diff --git a/modelscope/pipelines/outputs.py b/modelscope/pipelines/outputs.py
index 52b7eeae..290e6717 100644
--- a/modelscope/pipelines/outputs.py
+++ b/modelscope/pipelines/outputs.py
@@ -101,6 +101,13 @@ TASK_OUTPUTS = {
     #   }
     Tasks.sentence_similarity: ['scores', 'labels'],
 
+    # zero-shot classification result for single sample
+    #   {
+    #       "labels": ["happy", "sad", "calm", "angry"],
+    #       "scores": [0.9, 0.1, 0.05, 0.05]
+    #   }
+    Tasks.zero_shot_classification: ['scores', 'labels'],
+
     # ============ audio tasks ===================
 
     # audio processed for single file in PCM format
diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py
index 4ed63f3c..e8e33e74 100644
--- a/modelscope/preprocessors/nlp.py
+++ b/modelscope/preprocessors/nlp.py
@@ -14,7 +14,7 @@ from .builder import PREPROCESSORS
 __all__ = [
     'Tokenize', 'SequenceClassificationPreprocessor',
     'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor',
-    'FillMaskPreprocessor'
+    'FillMaskPreprocessor', 'ZeroShotClassificationPreprocessor'
 ]
 
 
@@ -286,3 +286,47 @@ class TokenClassifcationPreprocessor(Preprocessor):
             'attention_mask': attention_mask,
             'token_type_ids': token_type_ids
         }
+
+
+@PREPROCESSORS.register_module(
+    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
+class ZeroShotClassificationPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the SbertTokenizer found under the `model_dir` path
+
+        Args:
+            model_dir (str): model path
+        """
+
+        super().__init__(*args, **kwargs)
+
+        from sofa import SbertTokenizer
+        self.model_dir: str = model_dir
+        self.tokenizer = SbertTokenizer.from_pretrained(model_dir)
+        self.sequence_length = kwargs.pop('sequence_length', 512)
+
+    @type_assert(object, str)
+    def __call__(self, data: str, hypothesis_template: str,
+                 candidate_labels: list) -> Dict[str, Any]:
+        """Tokenize one (sentence, hypothesis) pair per candidate label.
+
+        Args:
+            data (str): a sentence, e.g. 'you are so handsome.'
+            hypothesis_template (str): filled with each label via `.format`
+            candidate_labels (list): labels used to build the hypotheses
+
+        Returns:
+            Dict[str, Any]: the tokenizer output for all pairs
+        """
+        hypotheses = (hypothesis_template.format(label)
+                      for label in candidate_labels)
+        sentence_pairs = [[data, hypothesis] for hypothesis in hypotheses]
+        return self.tokenizer(
+            sentence_pairs,
+            padding=True,
+            truncation=True,
+            max_length=self.sequence_length,
+            return_tensors='pt',
+            truncation_strategy='only_first',
+        )
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 55f015e8..44bd1dff 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -48,6 +48,7 @@ class Tasks(object):
     fill_mask = 'fill-mask'
     summarization = 'summarization'
     question_answering = 'question-answering'
+    zero_shot_classification = 'zero-shot-classification'
 
     # audio tasks
     auto_speech_recognition = 'auto-speech-recognition'
diff --git a/tests/pipelines/test_zero_shot_classification.py b/tests/pipelines/test_zero_shot_classification.py
new file mode 100644
index 00000000..b76a6a86
--- /dev/null
+++ b/tests/pipelines/test_zero_shot_classification.py
@@ -0,0 +1,64 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.nlp import SbertForZeroShotClassification
+from modelscope.pipelines import ZeroShotClassificationPipeline, pipeline
+from modelscope.preprocessors import ZeroShotClassificationPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class ZeroShotClassificationTest(unittest.TestCase):
+    model_id = 'damo/nlp_structbert_zero-shot-classification_chinese-base'
+    sentence = '全新突破 解放军运20版空中加油机曝光'
+    labels = ['文化', '体育', '娱乐', '财经', '家居', '汽车', '教育', '科技', '军事']
+    template = '这篇文章的标题是{}'
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_direct_file_download(self):
+        model_dir = snapshot_download(self.model_id)
+        preprocessor = ZeroShotClassificationPreprocessor(model_dir)
+        model = SbertForZeroShotClassification(
+            model_dir, tokenizer=preprocessor)
+        pipeline1 = ZeroShotClassificationPipeline(
+            model, preprocessor=preprocessor)
+        pipeline2 = pipeline(
+            Tasks.zero_shot_classification,
+            model=model,
+            preprocessor=preprocessor)
+
+        result1 = pipeline1(input=self.sentence, candidate_labels=self.labels)
+        print(f'sentence: {self.sentence}\npipeline1:{result1}')
+        print()
+        result2 = pipeline2(
+            self.sentence,
+            candidate_labels=self.labels,
+            hypothesis_template=self.template)
+        print(f'sentence: {self.sentence}\npipeline2: {result2}')
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        hub_model = Model.from_pretrained(self.model_id)
+        classifier = pipeline(
+            task=Tasks.zero_shot_classification,
+            model=hub_model,
+            preprocessor=ZeroShotClassificationPreprocessor(
+                hub_model.model_dir))
+        print(classifier(input=self.sentence, candidate_labels=self.labels))
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        classifier = pipeline(
+            model=self.model_id, task=Tasks.zero_shot_classification)
+        print(classifier(candidate_labels=self.labels, input=self.sentence))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        classifier = pipeline(task=Tasks.zero_shot_classification)
+        print(classifier(candidate_labels=self.labels, input=self.sentence))
+
+
+if __name__ == '__main__':
+    unittest.main()

From fabea5604e5795ce5cd341090865cf409490b062 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Wed, 29 Jun 2022 11:08:34 +0800
Subject: [PATCH 5/6] [to #42322933] Add MPLUG model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

添加 MPLUG 模型的 visual question answering 任务 pipeline

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9182119
---
 data/test/images/image_mplug_vqa.jpg          |  3 +
 modelscope/metainfo.py                        |  3 +
 modelscope/models/multi_modal/__init__.py     |  2 +
 .../mplug_for_visual_question_answering.py    | 46 +++++++++++++
 modelscope/pipelines/builder.py               |  5 +-
 modelscope/pipelines/multi_modal/__init__.py  |  1 +
 .../visual_question_answering_pipeline.py     | 65 +++++++++++++++++++
 modelscope/preprocessors/__init__.py          |  2 +-
 modelscope/preprocessors/multi_modal.py       | 45 +++++++++++++
 modelscope/utils/constant.py                  |  1 +
 requirements/nlp.txt                          |  2 +-
 .../test_visual_question_answering.py         | 60 +++++++++++++++++
 12 files changed, 232 insertions(+), 3 deletions(-)
 create mode 100644 data/test/images/image_mplug_vqa.jpg
 create mode 100644 modelscope/models/multi_modal/mplug_for_visual_question_answering.py
 create mode 100644 modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
 create mode 100644 tests/pipelines/test_visual_question_answering.py

diff --git a/data/test/images/image_mplug_vqa.jpg b/data/test/images/image_mplug_vqa.jpg
new file mode 100644
index 00000000..57919471
--- /dev/null
+++ b/data/test/images/image_mplug_vqa.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b37b706885849037b5fa7fa44a3b78a6375f768d95ce46bfcb8e7329d038a692
+size 181725
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 1d2ee4d2..485605bb 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -27,6 +27,7 @@ class Models(object):
     # multi-modal models
     ofa = 'ofa'
     clip = 'clip-multi-modal-embedding'
+    mplug = 'mplug'
 
 
 class Pipelines(object):
@@ -63,6 +64,7 @@ class Pipelines(object):
     # multi-modal tasks
     image_caption = 'image-caption'
     multi_modal_embedding = 'multi-modal-embedding'
+    visual_question_answering = 'visual-question-answering'
 
 
 class Trainers(object):
@@ -105,3 +107,4 @@ class Preprocessors(object):
 
     # multi-modal
     ofa_image_caption = 'ofa-image-caption'
+    mplug_visual_question_answering = 'mplug-visual-question-answering'
diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 2e6cc3bf..4ed9809b 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -1,2 +1,4 @@
 from .clip.clip_model import CLIPForMultiModalEmbedding
 from .image_captioning_model import OfaForImageCaptioning
+from .mplug_for_visual_question_answering import \
+    MPlugForVisualQuestionAnswering
diff --git a/modelscope/models/multi_modal/mplug_for_visual_question_answering.py b/modelscope/models/multi_modal/mplug_for_visual_question_answering.py
new file mode 100644
index 00000000..2682c048
--- /dev/null
+++ b/modelscope/models/multi_modal/mplug_for_visual_question_answering.py
@@ -0,0 +1,46 @@
+from typing import Dict
+
+from ...metainfo import Models
+from ...utils.constant import Tasks
+from ..base import Model, Tensor
+from ..builder import MODELS
+
+__all__ = ['MPlugForVisualQuestionAnswering']
+
+
+@MODELS.register_module(
+    Tasks.visual_question_answering, module_name=Models.mplug)
+class MPlugForVisualQuestionAnswering(Model):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Load the sofa MPLUG VQA model stored under the `model_dir` path.
+        Args:
+            model_dir (str): the model path.
+        """
+
+        super().__init__(model_dir, *args, **kwargs)
+        from sofa.models.mplug import MPlugForVisualQuestionAnswering as Impl
+        sofa_model = Impl.from_pretrained(model_dir)
+        self.model, self.tokenizer = sofa_model, sofa_model.tokenizer
+
+    def train(self):
+        return self.model.train()
+
+    def eval(self):
+        return self.model.eval()
+
+    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
+        """Run the wrapped model and return the first item of its output.
+
+        Args:
+            input (Dict[str, Tensor]): the preprocessed data, unpacked as
+                keyword arguments of the wrapped model
+
+        Returns:
+            Dict[str, Tensor]: element 0 of the wrapped model's output
+                NOTE(review): annotated as a dict, but the code returns
+                `output[0]`; confirm the actual structure against the
+                sofa implementation.
+        """
+        model_output = self.model(**input)
+        return model_output[0]
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 847955d4..2f66682d 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -42,7 +42,10 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                                'damo/cv_TAdaConv_action-recognition'),
     Tasks.multi_modal_embedding:
     (Pipelines.multi_modal_embedding,
-     'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding')
+     'damo/multi-modal_clip-vit-large-patch14-chinese_multi-modal-embedding'),
+    Tasks.visual_question_answering:
+    (Pipelines.visual_question_answering,
+     'damo/mplug_visual-question-answering_coco_large_en'),
 }
 
 
diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py
index 6c96d843..fdcada89 100644
--- a/modelscope/pipelines/multi_modal/__init__.py
+++ b/modelscope/pipelines/multi_modal/__init__.py
@@ -1,2 +1,3 @@
 from .image_captioning_pipeline import ImageCaptionPipeline
 from .multi_modal_embedding_pipeline import MultiModalEmbeddingPipeline
+from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline
diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
new file mode 100644
index 00000000..97c8cf7b
--- /dev/null
+++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py
@@ -0,0 +1,65 @@
+from typing import Any, Dict, Optional, Union
+
+import torch
+
+from ...metainfo import Pipelines
+from ...models import Model
+from ...models.multi_modal import MPlugForVisualQuestionAnswering
+from ...preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from ...utils.constant import Tasks
+from ..base import Pipeline, Tensor
+from ..builder import PIPELINES
+
+__all__ = ['VisualQuestionAnsweringPipeline']
+
+
+@PIPELINES.register_module(
+    Tasks.visual_question_answering,
+    module_name=Pipelines.visual_question_answering)
+class VisualQuestionAnsweringPipeline(Pipeline):
+
+    def __init__(self,
+                 model: Union[MPlugForVisualQuestionAnswering, str],
+                 preprocessor: Optional[
+                     MPlugVisualQuestionAnsweringPreprocessor] = None,
+                 **kwargs):
+        """use `model` and `preprocessor` to create a visual question answering pipeline for prediction
+
+        Args:
+            model: a model instance, or a str given to `Model.from_pretrained`
+            preprocessor: optional; built from `model.model_dir` when None
+        """
+        model = model if isinstance(
+            model,
+            MPlugForVisualQuestionAnswering) else Model.from_pretrained(model)
+        if preprocessor is None:
+            preprocessor = MPlugVisualQuestionAnsweringPreprocessor(
+                model.model_dir)
+        model.eval()
+        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
+        self.tokenizer = model.tokenizer
+
+    def forward(self, inputs: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
+        with torch.no_grad():
+            return super().forward(inputs, **forward_params)
+
+    def postprocess(self, inputs: Dict[str, Tensor],
+                    **postprocess_params) -> Dict[str, str]:
+        """Decode the first predicted sequence into a clean answer string.
+
+        Args:
+            inputs: model output; `inputs[0][0]` is decoded by the tokenizer
+
+        Returns:
+            Dict[str, str]: `{'answer': text}` with special tokens removed
+        """
+        import re
+        special_tokens = ('[unused0]', '[PAD]', '[unused1]', '[SEP]',
+                          '[unused2]', '[CLS]', '[UNK]')
+
+        pred_string = self.tokenizer.decode(inputs[0][0])
+        for token in special_tokens:
+            pred_string = pred_string.replace(token, '')
+        # `str.replace` is literal, so runs of spaces need a real regex.
+        return {'answer': re.sub(r' +', ' ', pred_string).strip()}
diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py
index 1bc06ce3..694688f6 100644
--- a/modelscope/preprocessors/__init__.py
+++ b/modelscope/preprocessors/__init__.py
@@ -6,6 +6,6 @@ from .builder import PREPROCESSORS, build_preprocessor
 from .common import Compose
 from .image import LoadImage, load_image
 from .kws import WavToLists
-from .multi_modal import OfaImageCaptionPreprocessor
+from .multi_modal import *  # noqa F403
 from .nlp import *  # noqa F403
 from .text_to_speech import *  # noqa F403
diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py
index 7c8f0fab..1bc686eb 100644
--- a/modelscope/preprocessors/multi_modal.py
+++ b/modelscope/preprocessors/multi_modal.py
@@ -16,6 +16,7 @@ from .image import load_image
 
 __all__ = [
     'OfaImageCaptionPreprocessor',
+    'MPlugVisualQuestionAnsweringPreprocessor',
 ]
 
 
@@ -110,3 +111,47 @@ class OfaImageCaptionPreprocessor(Preprocessor):
             }
         }
         return sample
+
+
+@PREPROCESSORS.register_module(
+    Fields.multi_modal,
+    module_name=Preprocessors.mplug_visual_question_answering)
+class MPlugVisualQuestionAnsweringPreprocessor(Preprocessor):
+
+    def __init__(self, model_dir: str, *args, **kwargs):
+        """Build the 'bert-base-uncased' tokenizer and the image transform;
+        the image resolution is read from the config under `model_dir`.
+        """
+        super().__init__(*args, **kwargs)
+
+        # text side: the tokenizer name is fixed to 'bert-base-uncased'
+        from transformers import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+        # image side: the target resolution comes from the model configuration
+        from sofa.models.mplug import CONFIG_NAME, MPlugConfig
+        mplug_config = MPlugConfig.from_yaml_file(
+            osp.join(model_dir, CONFIG_NAME))
+        target_size = (mplug_config.image_res, mplug_config.image_res)
+
+        from torchvision import transforms
+        self.patch_resize_transform = transforms.Compose([
+            transforms.Resize(target_size, interpolation=Image.BICUBIC),
+            transforms.ToTensor(),
+            # per-channel mean / std used for normalization
+            transforms.Normalize(
+                mean=(0.48145466, 0.4578275, 0.40821073),
+                std=(0.26862954, 0.26130258, 0.27577711)),
+        ])
+
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        image, question = data['image'], data['question']
+        if isinstance(image, str):
+            # a str is treated as a file path and loaded as RGB
+            image = Image.open(image).convert('RGB')
+        batched = torch.stack([self.patch_resize_transform(image)], dim=0)
+        encoded = self.tokenizer([question.lower()],
+                                 padding='longest',
+                                 return_tensors='pt')
+
+        return {'image': batched, 'question': encoded, 'train': False}
diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py
index 44bd1dff..3ce3ab98 100644
--- a/modelscope/utils/constant.py
+++ b/modelscope/utils/constant.py
@@ -61,6 +61,7 @@ class Tasks(object):
     visual_grounding = 'visual-grounding'
     text_to_image_synthesis = 'text-to-image-synthesis'
     multi_modal_embedding = 'multi-modal-embedding'
+    visual_question_answering = 'visual-question-answering'
 
 
 class InputFields(object):
diff --git a/requirements/nlp.txt b/requirements/nlp.txt
index 261b9ec5..574bf856 100644
--- a/requirements/nlp.txt
+++ b/requirements/nlp.txt
@@ -1 +1 @@
-https://alinlp.alibaba-inc.com/pypi/sofa-1.0.3-py3-none-any.whl
+https://alinlp.alibaba-inc.com/pypi/sofa-1.0.4.1-py3-none-any.whl
diff --git a/tests/pipelines/test_visual_question_answering.py b/tests/pipelines/test_visual_question_answering.py
new file mode 100644
index 00000000..4577607e
--- /dev/null
+++ b/tests/pipelines/test_visual_question_answering.py
@@ -0,0 +1,60 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.models import Model
+from modelscope.models.multi_modal import MPlugForVisualQuestionAnswering
+from modelscope.pipelines import VisualQuestionAnsweringPipeline, pipeline
+from modelscope.preprocessors import MPlugVisualQuestionAnsweringPreprocessor
+from modelscope.utils.constant import Tasks
+from modelscope.utils.test_utils import test_level
+
+
+class VisualQuestionAnsweringTest(unittest.TestCase):
+    model_id = 'damo/mplug_visual-question-answering_coco_large_en'
+    input_vqa = {
+        'image': 'data/test/images/image_mplug_vqa.jpg',
+        'question': 'What is the woman doing?',
+    }
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run(self):
+        model_dir = snapshot_download(self.model_id)
+        preprocessor = MPlugVisualQuestionAnsweringPreprocessor(model_dir)
+        model = MPlugForVisualQuestionAnswering(model_dir)
+        pipeline1 = VisualQuestionAnsweringPipeline(
+            model, preprocessor=preprocessor)
+        pipeline2 = pipeline(
+            Tasks.visual_question_answering,
+            model=model,
+            preprocessor=preprocessor)
+        print(f"question: {self.input_vqa['question']}")
+        for tag, vqa in (('pipeline1', pipeline1), ('pipeline2', pipeline2)):
+            print(f"{tag}: {vqa(self.input_vqa)['answer']}")
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_from_modelhub(self):
+        hub_model = Model.from_pretrained(self.model_id)
+        vqa = pipeline(
+            task=Tasks.visual_question_answering,
+            model=hub_model,
+            preprocessor=MPlugVisualQuestionAnsweringPreprocessor(
+                hub_model.model_dir))
+        answer = vqa(self.input_vqa)
+        print(answer)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        vqa = pipeline(
+            task=Tasks.visual_question_answering, model=self.model_id)
+        print(vqa(self.input_vqa))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_default_model(self):
+        vqa = pipeline(task=Tasks.visual_question_answering)
+        print(vqa(self.input_vqa))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 0264b25a600f184a840cfdbaf16a5c21e91a0206 Mon Sep 17 00:00:00 2001
From: "mulin.lyh" <mulin.lyh@taobao.com>
Date: Thu, 30 Jun 2022 13:54:49 +0800
Subject: [PATCH 6/6] [to #42322933]modify to new ip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修改为新的服务ip
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9217420

    * modify modelhub datahub to new ip, fix image-caption case bug
---
 modelscope/hub/constants.py     | 2 +-
 modelscope/metainfo.py          | 2 +-
 modelscope/msdatasets/config.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py
index 08f7c31d..0ee451c2 100644
--- a/modelscope/hub/constants.py
+++ b/modelscope/hub/constants.py
@@ -1,5 +1,5 @@
 MODELSCOPE_URL_SCHEME = 'http://'
-DEFAULT_MODELSCOPE_DOMAIN = '101.201.119.157:32330'
+DEFAULT_MODELSCOPE_DOMAIN = '47.94.223.21:31090'
 DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'
 
 DEFAULT_MODELSCOPE_GROUP = 'damo'
diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index 485605bb..e42b1233 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -62,7 +62,7 @@ class Pipelines(object):
     kws_kwsbp = 'kws-kwsbp'
 
     # multi-modal tasks
-    image_caption = 'image-caption'
+    image_caption = 'image-captioning'
     multi_modal_embedding = 'multi-modal-embedding'
     visual_question_answering = 'visual-question-answering'
 
diff --git a/modelscope/msdatasets/config.py b/modelscope/msdatasets/config.py
index 00c24c3a..22390ed7 100644
--- a/modelscope/msdatasets/config.py
+++ b/modelscope/msdatasets/config.py
@@ -19,4 +19,4 @@ DOWNLOADED_DATASETS_PATH = Path(
     os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))
 
 MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
-                                 'http://123.57.189.90:31752')
+                                 'http://47.94.223.21:31752')