
merge nlp

master · ly119399 · 3 years ago · commit 80461600f7
100 changed files with 6228 additions and 492 deletions
  1. +3 -0 data/test/images/image_captioning.png
  2. +3 -0 data/test/images/ocr_detection.jpg
  3. +3 -0 data/test/videos/action_recognition_test_video.mp4
  4. +0 -0 modelscope/hub/__init__.py
  5. +265 -0 modelscope/hub/api.py
  6. +8 -0 modelscope/hub/constants.py
  7. +30 -0 modelscope/hub/errors.py
  8. +254 -0 modelscope/hub/file_download.py
  9. +82 -0 modelscope/hub/git.py
  10. +173 -0 modelscope/hub/repository.py
  11. +125 -0 modelscope/hub/snapshot_download.py
  12. +0 -0 modelscope/hub/utils/__init__.py
  13. +40 -0 modelscope/hub/utils/_subprocess.py
  14. +294 -0 modelscope/hub/utils/caching.py
  15. +39 -0 modelscope/hub/utils/utils.py
  16. +104 -0 modelscope/metainfo.py
  17. +7 -3 modelscope/models/__init__.py
  18. +4 -2 modelscope/models/audio/tts/am/sambert_hifi_16k.py
  19. +3 -3 modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py
  20. +2 -1 modelscope/models/audio/tts/vocoder/hifigan16k.py
  21. +1 -1 modelscope/models/audio/tts/vocoder/models/models.py
  22. +16 -11 modelscope/models/base.py
  23. +0 -0 modelscope/models/cv/action_recognition/__init__.py
  24. +91 -0 modelscope/models/cv/action_recognition/models.py
  25. +472 -0 modelscope/models/cv/action_recognition/tada_convnext.py
  26. +1 -0 modelscope/models/multi_model/__init__.py
  27. +80 -0 modelscope/models/multi_model/image_captioning_model.py
  28. +3 -0 modelscope/models/nlp/__init__.py
  29. +2 -2 modelscope/models/nlp/bert_for_sequence_classification.py
  30. +63 -0 modelscope/models/nlp/masked_language_model.py
  31. +9 -3 modelscope/models/nlp/palm_for_text_generation.py
  32. +23 -0 modelscope/models/nlp/sbert_for_nli.py
  33. +8 -71 modelscope/models/nlp/sbert_for_sentence_similarity.py
  34. +24 -0 modelscope/models/nlp/sbert_for_sentiment_classification.py
  35. +71 -0 modelscope/models/nlp/sbert_for_sequence_classification.py
  36. +18 -11 modelscope/models/nlp/sbert_for_token_classification.py
  37. +5 -7 modelscope/models/nlp/space/dialog_intent_prediction_model.py
  38. +5 -6 modelscope/models/nlp/space/dialog_modeling_model.py
  39. +1 -1 modelscope/models/nlp/space/dialog_state_tracking.py
  40. +1 -2 modelscope/models/nlp/space/model/gen_unified_transformer.py
  41. +1 -1 modelscope/models/nlp/space/model/intent_unified_transformer.py
  42. +3 -4 modelscope/models/nlp/space/model/unified_transformer.py
  43. +2 -3 modelscope/models/nlp/space/modules/transformer_block.py
  44. +1 -4 modelscope/pipelines/__init__.py
  45. +3 -1 modelscope/pipelines/audio/linear_aec_pipeline.py
  46. +2 -1 modelscope/pipelines/audio/text_to_speech_pipeline.py
  47. +8 -14 modelscope/pipelines/base.py
  48. +49 -23 modelscope/pipelines/builder.py
  49. +2 -0 modelscope/pipelines/cv/__init__.py
  50. +65 -0 modelscope/pipelines/cv/action_recognition_pipeline.py
  51. +2 -1 modelscope/pipelines/cv/image_cartoon_pipeline.py
  52. +2 -1 modelscope/pipelines/cv/image_matting_pipeline.py
  53. +168 -0 modelscope/pipelines/cv/ocr_detection_pipeline.py
  54. +0 -0 modelscope/pipelines/cv/ocr_utils/__init__.py
  55. +158 -0 modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py
  56. +1098 -0 modelscope/pipelines/cv/ocr_utils/ops.py
  57. +432 -0 modelscope/pipelines/cv/ocr_utils/resnet18_v1.py
  58. +231 -0 modelscope/pipelines/cv/ocr_utils/resnet_utils.py
  59. +108 -0 modelscope/pipelines/cv/ocr_utils/utils.py
  60. +1 -1 modelscope/pipelines/multi_modal/__init__.py
  61. +35 -0 modelscope/pipelines/multi_modal/image_captioning_pipeline.py
  62. +6 -3 modelscope/pipelines/nlp/__init__.py
  63. +9 -7 modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
  64. +4 -3 modelscope/pipelines/nlp/dialog_modeling_pipeline.py
  65. +45 -0 modelscope/pipelines/nlp/dialog_state_tracking.py
  66. +107 -0 modelscope/pipelines/nlp/fill_mask_pipeline.py
  67. +72 -0 modelscope/pipelines/nlp/nli_pipeline.py
  68. +19 -9 modelscope/pipelines/nlp/sentence_similarity_pipeline.py
  69. +77 -0 modelscope/pipelines/nlp/sentiment_classification_pipeline.py
  70. +2 -1 modelscope/pipelines/nlp/sequence_classification_pipeline.py
  71. +0 -46 modelscope/pipelines/nlp/space/dialog_state_tracking.py
  72. +18 -7 modelscope/pipelines/nlp/text_generation_pipeline.py
  73. +19 -9 modelscope/pipelines/nlp/word_segmentation_pipeline.py
  74. +33 -0 modelscope/pipelines/outputs.py
  75. +44 -12 modelscope/pipelines/util.py
  76. +4 -4 modelscope/preprocessors/__init__.py
  77. +2 -1 modelscope/preprocessors/image.py
  78. +41 -50 modelscope/preprocessors/multi_modal.py
  79. +223 -21 modelscope/preprocessors/nlp.py
  80. +4 -5 modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py
  81. +6 -8 modelscope/preprocessors/space/dialog_modeling_preprocessor.py
  82. +31 -29 modelscope/preprocessors/space/fields/dst_processors.py
  83. +4 -4 modelscope/preprocessors/space/fields/gen_field.py
  84. +4 -4 modelscope/preprocessors/space/fields/intent_field.py
  85. +3 -4 modelscope/preprocessors/text_to_speech.py
  86. +232 -0 modelscope/preprocessors/video.py
  87. +22 -0 modelscope/pydatasets/config.py
  88. +323 -58 modelscope/pydatasets/py_dataset.py
  89. +0 -0 modelscope/pydatasets/utils/__init__.py
  90. +66 -0 modelscope/pydatasets/utils/ms_api.py
  91. +1 -1 modelscope/trainers/nlp/space/trainers/gen_trainer.py
  92. +1 -3 modelscope/trainers/nlp/space/trainers/intent_trainer.py
  93. +5 -2 modelscope/utils/constant.py
  94. +61 -8 modelscope/utils/hub.py
  95. +1 -1 modelscope/utils/registry.py
  96. +15 -0 modelscope/utils/test_utils.py
  97. +11 -11 requirements/audio.txt
  98. +2 -0 requirements/cv.txt
  99. +3 -2 requirements/nlp.txt
  100. +4 -1 requirements/runtime.txt

+3 -0 data/test/images/image_captioning.png

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141
size 603621

+3 -0 data/test/images/ocr_detection.jpg

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94
size 141149

+3 -0 data/test/videos/action_recognition_test_video.mp4

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24dc4237b1197321ee8486bb983fa01fd47e2b4afdb3c2df24229e5f2bd20119
size 1475924

modelscope/pipelines/nlp/space/__init__.py → modelscope/hub/__init__.py


+265 -0 modelscope/hub/api.py

@@ -0,0 +1,265 @@
import os
import pickle
import subprocess
from http.cookiejar import CookieJar
from os.path import expanduser
from typing import List, Optional, Tuple, Union

import requests

from modelscope.utils.logger import get_logger
from .constants import LOGGER_NAME
from .errors import NotExistError, is_ok, raise_on_error
from .utils.utils import get_endpoint, model_id_to_group_owner_name

logger = get_logger()


class HubApi:

def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()

def login(
self,
user_name: str,
password: str,
) -> Tuple[str, CookieJar]:
"""
Login with username and password

Args:
user_name(`str`): user name on ModelScope
password(`str`): password

Returns:
cookies: to authenticate yourself to ModelScope open-api
gitlab token: to access private repos

<Tip>
You only have to login once within 30 days.
</Tip>

TODO: handle cookies expire

"""
path = f'{self.endpoint}/api/v1/login'
r = requests.post(
path, json={
'username': user_name,
'password': password
})
r.raise_for_status()
d = r.json()
raise_on_error(d)

token = d['Data']['AccessToken']
cookies = r.cookies

# save token and cookie
ModelScopeConfig.save_token(token)
ModelScopeConfig.save_cookies(cookies)
ModelScopeConfig.write_to_git_credential(user_name, password)

return d['Data']['AccessToken'], cookies

def create_model(self, model_id: str, chinese_name: str, visibility: int,
license: str) -> str:
"""
Create model repo at ModelScopeHub

Args:
model_id:(`str`): The model id
chinese_name(`str`): chinese name of the model
visibility(`int`): visibility of the model(1-private, 3-internal, 5-public)
license(`str`): license of the model, candidates can be found at: TBA

Returns:
name of the model created

<Tip>
model_id = {owner}/{name}
</Tip>
"""
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise ValueError('Token does not exist, please login first.')

path = f'{self.endpoint}/api/v1/models'
owner_or_group, name = model_id_to_group_owner_name(model_id)
r = requests.post(
path,
json={
'Path': owner_or_group,
'Name': name,
'ChineseName': chinese_name,
'Visibility': visibility,
'License': license
},
cookies=cookies)
r.raise_for_status()
raise_on_error(r.json())
d = r.json()
return d['Data']['Name']

def delete_model(self, model_id):
"""_summary_

Args:
model_id (str): The model id.
<Tip>
model_id = {owner}/{name}
</Tip>
"""
cookies = ModelScopeConfig.get_cookies()
path = f'{self.endpoint}/api/v1/models/{model_id}'

r = requests.delete(path, cookies=cookies)
r.raise_for_status()
raise_on_error(r.json())

def get_model_url(self, model_id):
return f'{self.endpoint}/api/v1/models/{model_id}.git'

def get_model(
self,
model_id: str,
revision: str = 'master',
) -> str:
"""
Get model information at modelscope_hub

Args:
model_id(`str`): The model id.
revision(`str`): revision of model
Returns:
The model details information.
Raises:
NotExistError: If the model does not exist.
<Tip>
model_id = {owner}/{name}
</Tip>
"""
cookies = ModelScopeConfig.get_cookies()
owner_or_group, name = model_id_to_group_owner_name(model_id)
path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}?Revision={revision}'

r = requests.get(path, cookies=cookies)
if r.status_code == 200:
if is_ok(r.json()):
return r.json()['Data']
else:
raise NotExistError(r.json()['Message'])
else:
r.raise_for_status()

def get_model_branches_and_tags(
self,
model_id: str,
) -> Tuple[List[str], List[str]]:
cookies = ModelScopeConfig.get_cookies()

path = f'{self.endpoint}/api/v1/models/{model_id}/revisions'
r = requests.get(path, cookies=cookies)
r.raise_for_status()
d = r.json()
raise_on_error(d)
info = d['Data']
branches = [x['Revision'] for x in info['RevisionMap']['Branches']
] if info['RevisionMap']['Branches'] else []
tags = [x['Revision'] for x in info['RevisionMap']['Tags']
] if info['RevisionMap']['Tags'] else []
return branches, tags

def get_model_files(
self,
model_id: str,
revision: Optional[str] = 'master',
root: Optional[str] = None,
recursive: Optional[bool] = False,
use_cookies: Union[bool, CookieJar] = False) -> List[dict]:

cookies = None
if isinstance(use_cookies, CookieJar):
cookies = use_cookies
elif use_cookies:
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise ValueError('Token does not exist, please login first.')

path = f'{self.endpoint}/api/v1/models/{model_id}/repo/files?Revision={revision}&Recursive={recursive}'
if root is not None:
path = path + f'&Root={root}'

r = requests.get(path, cookies=cookies)

r.raise_for_status()
d = r.json()
raise_on_error(d)

files = []
for file in d['Data']['Files']:
if file['Name'] == '.gitignore' or file['Name'] == '.gitattributes':
continue

files.append(file)
return files


class ModelScopeConfig:
path_credential = expanduser('~/.modelscope/credentials')
os.makedirs(path_credential, exist_ok=True)

@classmethod
def save_cookies(cls, cookies: CookieJar):
with open(os.path.join(cls.path_credential, 'cookies'), 'wb+') as f:
pickle.dump(cookies, f)

@classmethod
def get_cookies(cls):
try:
with open(os.path.join(cls.path_credential, 'cookies'), 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
logger.warning(
"Auth token does not exist, you'll get an authentication"
' error when downloading private model files. Please login first.')

@classmethod
def save_token(cls, token: str):
with open(os.path.join(cls.path_credential, 'token'), 'w+') as f:
f.write(token)

@classmethod
def get_token(cls) -> Optional[str]:
"""
Get token or None if not existent.

Returns:
`str` or `None`: The token, `None` if it doesn't exist.

"""
token = None
try:
with open(os.path.join(cls.path_credential, 'token'), 'r') as f:
token = f.read()
except FileNotFoundError:
pass
return token

@staticmethod
def write_to_git_credential(username: str, password: str):
with subprocess.Popen(
'git credential-store store'.split(),
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
) as process:
input_username = f'username={username.lower()}'
input_password = f'password={password}'

process.stdin.write(
f'url={get_endpoint()}\n{input_username}\n{input_password}\n\n'
.encode('utf-8'))
process.stdin.flush()
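
A minimal usage sketch of HubApi (hedged: the user name, password, and model id below are placeholders, not real credentials):

from modelscope.hub.api import HubApi

api = HubApi()
# login once; token and cookies are persisted under ~/.modelscope/credentials
token, cookies = api.login(user_name='my-user', password='my-password')
# create a repo: model_id = {owner}/{name}; visibility: 1-private, 3-internal, 5-public
api.create_model(model_id='my-user/my-model', chinese_name='my-model',
                 visibility=5, license='Apache License 2.0')
model_info = api.get_model('my-user/my-model')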

+8 -0 modelscope/hub/constants.py

@@ -0,0 +1,8 @@
MODELSCOPE_URL_SCHEME = 'http://'
DEFAULT_MODELSCOPE_DOMAIN = '101.201.119.157:32330'
DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102'

DEFAULT_MODELSCOPE_GROUP = 'damo'
MODEL_ID_SEPARATOR = '/'

LOGGER_NAME = 'ModelScopeHub'

+30 -0 modelscope/hub/errors.py

@@ -0,0 +1,30 @@
class NotExistError(Exception):
pass


class RequestError(Exception):
pass


def is_ok(rsp):
""" Check the request is ok

Args:
rsp (dict): The response body.
Failed: {'Code': 10010101004, 'Message': 'get model info failed, err: unauthorized permission',
'RequestId': '', 'Success': False}
Success: {'Code': 200, 'Data': {}, 'Message': 'success', 'RequestId': '', 'Success': True}
"""
return rsp['Code'] == 200 and rsp['Success']


def raise_on_error(rsp):
"""If response error, raise exception

Args:
rsp (_type_): The server response
"""
if rsp['Code'] == 200 and rsp['Success']:
return True
else:
raise RequestError(rsp['Message'])

+254 -0 modelscope/hub/file_download.py

@@ -0,0 +1,254 @@
import copy
import fnmatch
import logging
import os
import sys
import tempfile
import time
from functools import partial
from hashlib import sha256
from pathlib import Path
from typing import BinaryIO, Dict, Optional, Union
from uuid import uuid4

import json
import requests
from filelock import FileLock
from requests.exceptions import HTTPError
from tqdm import tqdm

from modelscope import __version__
from modelscope.utils.logger import get_logger
from .api import HubApi, ModelScopeConfig
from .constants import (DEFAULT_MODELSCOPE_GROUP, LOGGER_NAME,
MODEL_ID_SEPARATOR)
from .errors import NotExistError, RequestError, raise_on_error
from .utils.caching import ModelFileSystemCache
from .utils.utils import (get_cache_dir, get_endpoint,
model_id_to_group_owner_name)

SESSION_ID = uuid4().hex
logger = get_logger()


def model_file_download(
model_id: str,
file_path: str,
revision: Optional[str] = 'master',
cache_dir: Optional[str] = None,
user_agent: Union[Dict, str, None] = None,
local_files_only: Optional[bool] = False,
) -> Optional[str]: # pragma: no cover
"""
Download from a given URL and cache it if it's not already present in the
local cache.

Given a URL, this function looks for the corresponding file in the local
cache. If it's not there, download it. Then return the path to the cached
file.

Args:
model_id (`str`):
The model to which the file to be downloaded belongs.
file_path(`str`):
Path of the file to be downloaded, relative to the root of model repo
revision(`str`, *optional*):
revision of the model file to be downloaded.
Can be any of a branch, tag or commit hash, default to `master`
cache_dir (`str`, `Path`, *optional*):
Path to the folder where cached files are stored.
user_agent (`dict`, `str`, *optional*):
The user-agent info in the form of a dictionary or a string.
local_files_only (`bool`, *optional*, defaults to `False`):
If `True`, avoid downloading the file and return the path to the
local cached file if it exists.
if `False`, download the file anyway, even if it exists

Returns:
Local path (string) of file or if networking is off, last version of
file cached on disk.

<Tip>

Raises the following errors:

- [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
if `use_auth_token=True` and the token cannot be found.
- [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
if ETag cannot be determined.
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
if some parameter value is invalid

</Tip>
"""
if cache_dir is None:
cache_dir = get_cache_dir()
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)

group_or_owner, name = model_id_to_group_owner_name(model_id)

cache = ModelFileSystemCache(cache_dir, group_or_owner, name)

# if local_files_only is `True` and the file already exists in cached_path
# return the cached path
if local_files_only:
cached_file_path = cache.get_file_by_path(file_path)
if cached_file_path is not None:
logger.warning(
"File exists in local cache, but we're not sure it's up to date"
)
return cached_file_path
else:
raise ValueError(
'Cannot find the requested files in the cached path and outgoing'
' traffic has been disabled. To enable model look-ups and downloads'
" online, set 'local_files_only' to False.")

_api = HubApi()
headers = {'user-agent': http_user_agent(user_agent=user_agent, )}
branches, tags = _api.get_model_branches_and_tags(model_id)
file_to_download_info = None
is_commit_id = False
if revision in branches or revision in tags: # The revision is a branch or tag;
# we need to confirm the version is up to date,
# so get the file list to check whether the latest version is cached; if so return it, otherwise download
model_files = _api.get_model_files(
model_id=model_id,
revision=revision,
recursive=True,
)

for model_file in model_files:
if model_file['Type'] == 'tree':
continue

if model_file['Path'] == file_path:
model_file['Branch'] = revision
if cache.exists(model_file):
return cache.get_file_by_info(model_file)
else:
file_to_download_info = model_file

if file_to_download_info is None:
raise NotExistError('The file path: %s does not exist in: %s' %
(file_path, model_id))
else: # the revision is commit id.
cached_file_path = cache.get_file_by_path_and_commit_id(
file_path, revision)
if cached_file_path is not None:
logger.info('The specified file is in cache, skip downloading!')
return cached_file_path # the file is in cache.
is_commit_id = True
# we need to download again
# TODO: skip using JWT for authorization, use cookie instead
cookies = ModelScopeConfig.get_cookies()
url_to_download = get_file_download_url(model_id, file_path, revision)
file_to_download_info = {
'Path': file_path,
'Revision':
revision if is_commit_id else file_to_download_info['Revision']
}
# Prevent parallel downloads of the same file with a lock.
lock_path = cache.get_root_location() + '.lock'

with FileLock(lock_path):
temp_file_name = next(tempfile._get_candidate_names())
http_get_file(
url_to_download,
cache_dir,
temp_file_name,
headers=headers,
cookies=None if cookies is None else cookies.get_dict())
return cache.put_file(file_to_download_info,
os.path.join(cache_dir, temp_file_name))


def http_user_agent(user_agent: Union[Dict, str, None] = None, ) -> str:
"""Formats a user-agent string with basic info about a request.

Args:
user_agent (`str`, `dict`, *optional*):
The user agent info in the form of a dictionary or a single string.

Returns:
The formatted user-agent string.
"""
ua = f'modelscope/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}'

if isinstance(user_agent, dict):
ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items())
elif isinstance(user_agent, str):
ua = user_agent
return ua


def get_file_download_url(model_id: str, file_path: str, revision: str):
"""
Format file download url according to `model_id`, `revision` and `file_path`.
e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`,
the resulting download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md
"""
download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}'
return download_url_template.format(
endpoint=get_endpoint(),
model_id=model_id,
revision=revision,
file_path=file_path,
)


def http_get_file(
url: str,
local_dir: str,
file_name: str,
cookies: Dict[str, str],
headers: Optional[Dict[str, str]] = None,
):
"""
Download a remote file. Do not gobble up errors.
This method is shared by model_file_download and snapshot_download, whose behaviors differ slightly.
TODO: consolidate the two download paths to avoid duplicate code

Args:
url(`str`):
actual download url of the file
local_dir(`str`):
local directory where the downloaded file stores
file_name(`str`):
name of the file stored in `local_dir`
cookies(`Dict[str, str]`):
cookies used to authenticate the user, which is required for downloading private repos
headers(`Optional[Dict[str, str]] = None`):
http headers to carry necessary info when requesting the remote file

"""
temp_file_manager = partial(
tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False)

with temp_file_manager() as temp_file:
logger.info('downloading %s to %s', url, temp_file.name)
headers = copy.deepcopy(headers)

r = requests.get(url, stream=True, headers=headers, cookies=cookies)
r.raise_for_status()

content_length = r.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None

progress = tqdm(
unit='B',
unit_scale=True,
unit_divisor=1024,
total=total,
initial=0,
desc='Downloading',
)
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()

logger.info('storing %s in cache at %s', url, local_dir)
os.replace(temp_file.name, os.path.join(local_dir, file_name))
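
A minimal usage sketch of model_file_download (hedged: the model id and file path are placeholders):

from modelscope.hub.file_download import model_file_download

# downloads README.md of the given model at branch `master` into the local cache
local_path = model_file_download(
    model_id='damo/my-model', file_path='README.md', revision='master')
print(local_path)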

+82 -0 modelscope/hub/git.py

@@ -0,0 +1,82 @@
from typing import Union

from modelscope.utils.logger import get_logger
from .constants import LOGGER_NAME
from .utils._subprocess import run_subprocess

logger = get_logger()


def git_clone(
local_dir: str,
repo_url: str,
):
# TODO: use "git clone" or "git lfs clone" according to git version
# TODO: print stderr when subprocess fails
run_subprocess(
f'git clone {repo_url}'.split(),
local_dir,
True,
)


def git_checkout(
local_dir: str,
revision: str,
):
run_subprocess(f'git checkout {revision}'.split(), local_dir)


def git_add(local_dir: str, ):
run_subprocess(
'git add .'.split(),
local_dir,
True,
)


def git_commit(local_dir: str, commit_message: str):
run_subprocess(
'git commit -v -m'.split() + [commit_message],
local_dir,
True,
)


def git_push(local_dir: str, branch: str):
# check current branch
cur_branch = git_current_branch(local_dir)
if cur_branch != branch:
logger.error(
"You're trying to push to a different branch, please double check")
return

run_subprocess(
f'git push origin {branch}'.split(),
local_dir,
True,
)


def git_current_branch(local_dir: str) -> Union[str, None]:
"""
Get current branch name

Args:
local_dir(`str`): local model repo directory

Returns:
The name of the branch you're currently on.
"""
try:
process = run_subprocess(
'git rev-parse --abbrev-ref HEAD'.split(),
local_dir,
True,
)

return str(process.stdout).strip()
except Exception as e:
raise e
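
A sketch of the intended commit-and-push flow with these helpers (hedged: the local path and commit message are placeholders, and local_dir is assumed to already be a git clone):

from modelscope.hub.git import git_add, git_commit, git_current_branch, git_push

local_dir = '/path/to/my-model'
git_add(local_dir)
git_commit(local_dir, 'add model weights')
git_push(local_dir, git_current_branch(local_dir))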

+173 -0 modelscope/hub/repository.py

@@ -0,0 +1,173 @@
import os
import subprocess
from pathlib import Path
from typing import Optional, Union

from modelscope.utils.logger import get_logger
from .api import ModelScopeConfig
from .constants import MODELSCOPE_URL_SCHEME
from .git import git_add, git_checkout, git_clone, git_commit, git_push
from .utils._subprocess import run_subprocess
from .utils.utils import get_gitlab_domain

logger = get_logger()


class Repository:

def __init__(
self,
local_dir: str,
clone_from: Optional[str] = None,
auth_token: Optional[str] = None,
private: Optional[bool] = False,
revision: Optional[str] = 'master',
):
"""
Instantiate a Repository object by cloning the remote ModelScopeHub repo
Args:
local_dir(`str`):
local directory to store the model files
clone_from(`Optional[str] = None`):
model id in ModelScope-hub from which git clone
You should ignore this parameter when `local_dir` is already a git repo
auth_token(`Optional[str]`):
token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter
as the token is already saved when you login the first time
private(`Optional[bool]`):
whether the model is private, default to False
revision(`Optional[str]`):
revision of the model you want to clone from. Can be any of a branch, tag or commit hash
"""
logger.info('Instantiating Repository object...')

# Create local directory if not exist
os.makedirs(local_dir, exist_ok=True)
self.local_dir = os.path.join(os.getcwd(), local_dir)

self.private = private

# Check git and git-lfs installation
self.check_git_versions()

# Retrieve auth token
if not private and isinstance(auth_token, str):
logger.warning(
'cloning a public repo with a token, which will be ignored')
self.token = None
else:
if isinstance(auth_token, str):
self.token = auth_token
else:
self.token = ModelScopeConfig.get_token()

if self.token is None:
raise EnvironmentError(
'Token does not exist, the clone will fail for private repo.'
'Please login first.')

# git clone
if clone_from is not None:
self.model_id = clone_from
logger.info('cloning model repo to %s ...', self.local_dir)
git_clone(self.local_dir, self.get_repo_url())
else:
if is_git_repo(self.local_dir):
logger.debug('[Repository] is a valid git repo')
else:
raise ValueError(
'If not specifying `clone_from`, you need to pass Repository a'
' valid git clone.')

# git checkout
if isinstance(revision, str) and revision != 'master':
git_checkout(self.local_dir, revision)

def push_to_hub(self,
commit_message: str,
revision: Optional[str] = 'master'):
"""
Push local changes to the hub

Args:
commit_message(`str`):
commit message describing the changes, it's mandatory
revision(`Optional[str]`):
remote branch you want to push to, default to `master`

<Tip>
The function complains when the local and remote branches differ, so please double-check before pushing
</Tip>

"""
git_add(self.local_dir)
git_commit(self.local_dir, commit_message)

logger.info('Pushing changes to repo...')
git_push(self.local_dir, revision)

# TODO: if git push fails, how to retry?

def check_git_versions(self):
"""
Checks that `git` and `git-lfs` can be run.

Raises:
`EnvironmentError`: if `git` or `git-lfs` are not installed.
"""
try:
git_version = run_subprocess('git --version'.split(),
self.local_dir).stdout.strip()
except FileNotFoundError:
raise EnvironmentError(
'Looks like you do not have git installed, please install.')

try:
lfs_version = run_subprocess('git-lfs --version'.split(),
self.local_dir).stdout.strip()
except FileNotFoundError:
raise EnvironmentError(
'Looks like you do not have git-lfs installed, please install.'
' You can install from https://git-lfs.github.com/.'
' Then run `git lfs install` (you only have to do this once).')
logger.info(git_version + '\n' + lfs_version)

def get_repo_url(self) -> str:
"""
Get the repo url to clone, according to whether the repo is private or not
"""
url = None

if self.private:
url = f'{MODELSCOPE_URL_SCHEME}oauth2:{self.token}@{get_gitlab_domain()}/{self.model_id}'
else:
url = f'{MODELSCOPE_URL_SCHEME}{get_gitlab_domain()}/{self.model_id}'

if not url:
raise ValueError(
'Empty repo url, please check clone_from parameter')

logger.debug('url to clone: %s', str(url))

return url


def is_git_repo(folder: Union[str, Path]) -> bool:
"""
Check if the folder is the root or part of a git repository

Args:
folder (`str`):
The folder in which to run the command.

Returns:
`bool`: `True` if the folder is the root of or part of a git repository, `False`
otherwise.
"""
folder_exists = os.path.exists(os.path.join(folder, '.git'))
git_branch = subprocess.run(
'git branch'.split(),
cwd=folder,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return folder_exists and git_branch.returncode == 0
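
A minimal usage sketch of Repository (hedged: the model id and paths are placeholders):

from modelscope.hub.repository import Repository

repo = Repository(local_dir='my-model', clone_from='my-user/my-model')
# ... modify or add files under my-model/ ...
repo.push_to_hub(commit_message='update model files')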

+125 -0 modelscope/hub/snapshot_download.py

@@ -0,0 +1,125 @@
import os
import tempfile
from glob import glob
from pathlib import Path
from typing import Dict, Optional, Union

from modelscope.utils.logger import get_logger
from .api import HubApi, ModelScopeConfig
from .constants import DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR
from .errors import NotExistError, RequestError, raise_on_error
from .file_download import (get_file_download_url, http_get_file,
http_user_agent)
from .utils.caching import ModelFileSystemCache
from .utils.utils import get_cache_dir, model_id_to_group_owner_name

logger = get_logger()


def snapshot_download(model_id: str,
revision: Optional[str] = 'master',
cache_dir: Union[str, Path, None] = None,
user_agent: Optional[Union[Dict, str]] = None,
local_files_only: Optional[bool] = False,
private: Optional[bool] = False) -> str:
"""Download all files of a repo.
Downloads a whole snapshot of a repo's files at the specified revision. This
is useful when you want all files from a repo, because you don't know which
ones you will need a priori. All files are nested inside a folder in order
to keep their actual filename relative to that folder.

An alternative would be to just clone a repo but this would require that the
user always has git and git-lfs installed, and properly configured.
Args:
model_id (`str`):
A user or an organization name and a repo name separated by a `/`.
revision (`str`, *optional*):
An optional Git revision id which can be a branch name, a tag, or a
commit hash. NOTE: currently only branch and tag names are supported
cache_dir (`str`, `Path`, *optional*):
Path to the folder where cached files are stored.
user_agent (`str`, `dict`, *optional*):
The user-agent info in the form of a dictionary or a string.
local_files_only (`bool`, *optional*, defaults to `False`):
If `True`, avoid downloading the file and return the path to the
local cached file if it exists.
Returns:
Local folder path (string) of repo snapshot

<Tip>
Raises the following errors:
- [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
if `use_auth_token=True` and the token cannot be found.
- [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
ETag cannot be determined.
- [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
if some parameter value is invalid
</Tip>
"""

if cache_dir is None:
cache_dir = get_cache_dir()
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)

group_or_owner, name = model_id_to_group_owner_name(model_id)

cache = ModelFileSystemCache(cache_dir, group_or_owner, name)
if local_files_only:
if len(cache.cached_files) == 0:
raise ValueError(
'Cannot find the requested files in the cached path and outgoing'
' traffic has been disabled. To enable model look-ups and downloads'
" online, set 'local_files_only' to False.")
logger.warning('We cannot confirm the cached files are for revision: %s'
% revision)
return cache.get_root_location(
) # we can not confirm the cached file is for snapshot 'revision'
else:
# make headers
headers = {'user-agent': http_user_agent(user_agent=user_agent, )}
_api = HubApi()
# get file list from model repo
branches, tags = _api.get_model_branches_and_tags(model_id)
if revision not in branches and revision not in tags:
raise NotExistError('The specified branch or tag: %s does not exist!'
% revision)

model_files = _api.get_model_files(
model_id=model_id,
revision=revision,
recursive=True,
use_cookies=private)

cookies = None
if private:
cookies = ModelScopeConfig.get_cookies()

for model_file in model_files:
if model_file['Type'] == 'tree':
continue
# check whether model_file exists in the cache; if so, skip the download
if cache.exists(model_file):
logger.info(
'The specified file is in cache, skip downloading!')
continue

# get download url
url = get_file_download_url(
model_id=model_id,
file_path=model_file['Path'],
revision=revision)

# First download to /tmp
http_get_file(
url=url,
local_dir=tempfile.gettempdir(),
file_name=model_file['Name'],
headers=headers,
cookies=None if cookies is None else cookies.get_dict())
# put file to cache
cache.put_file(
model_file,
os.path.join(tempfile.gettempdir(), model_file['Name']))

return cache.get_root_location()
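
A minimal usage sketch of snapshot_download (hedged: the model id is a placeholder):

from modelscope.hub.snapshot_download import snapshot_download

model_dir = snapshot_download('damo/my-model', revision='master')
# model_dir now contains all files of the repo at that revision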

tests/pipelines/nlp/__init__.py → modelscope/hub/utils/__init__.py


+40 -0 modelscope/hub/utils/_subprocess.py

@@ -0,0 +1,40 @@
import subprocess
from typing import List


def run_subprocess(command: List[str],
folder: str,
check=True,
**kwargs) -> subprocess.CompletedProcess:
"""
Method to run subprocesses. Calling this will capture the `stderr` and `stdout`,
please call `subprocess.run` manually in case you would like for them not to
be captured.

Args:
command (`List[str]`):
The command to execute as a list of strings.
folder (`str`):
The folder in which to run the command.
check (`bool`, *optional*, defaults to `True`):
Setting `check` to `True` will raise a `subprocess.CalledProcessError`
when the subprocess has a non-zero exit code.
kwargs (`Dict[str]`):
Keyword arguments to be passed to the `subprocess.run` underlying command.

Returns:
`subprocess.CompletedProcess`: The completed process.
"""
if isinstance(command, str):
raise ValueError(
'`run_subprocess` should be called with a list of strings.')

return subprocess.run(
command,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
check=check,
encoding='utf-8',
cwd=folder,
**kwargs,
)
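
A minimal usage sketch of run_subprocess (hedged: the command and folder are illustrative):

from modelscope.hub.utils._subprocess import run_subprocess

result = run_subprocess('git --version'.split(), folder='.')
print(result.stdout.strip())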

+294 -0 modelscope/hub/utils/caching.py

@@ -0,0 +1,294 @@
import hashlib
import logging
import os
import pickle
import tempfile
import time
from shutil import move, rmtree

from modelscope.utils.logger import get_logger

logger = get_logger()


class FileSystemCache(object):
"""Local file cache."""

KEY_FILE_NAME = '.msc'

def __init__(
self,
cache_root_location: str,
**kwargs,
):
"""
Parameters
----------
cache_location: str
The root location to store files.
"""
os.makedirs(cache_root_location, exist_ok=True)
self.cache_root_location = cache_root_location
self.load_cache()

def get_root_location(self):
return self.cache_root_location

def load_cache(self):
"""Read set of stored blocks from file
Args:
owner(`str`): individual or group username at modelscope, can be empty for official models
name(`str`): name of the model
Returns:
The model details information.
Raises:
NotExistError: If the model is not exist, will throw NotExistError
TODO: Error based error code.
<Tip>
model_id = {owner}/{name}
</Tip>
"""
self.cached_files = []
cache_keys_file_path = os.path.join(self.cache_root_location,
FileSystemCache.KEY_FILE_NAME)
if os.path.exists(cache_keys_file_path):
with open(cache_keys_file_path, 'rb') as f:
self.cached_files = pickle.load(f)

def save_cached_files(self):
"""Save cache metadata."""
# save new meta to tmp and move to KEY_FILE_NAME
cache_keys_file_path = os.path.join(self.cache_root_location,
FileSystemCache.KEY_FILE_NAME)
# TODO: Sync file write
fd, fn = tempfile.mkstemp()
with open(fd, 'wb') as f:
pickle.dump(self.cached_files, f)
move(fn, cache_keys_file_path)

def get_file(self, key):
"""Check the key is in the cache, if exist, return the file, otherwise return None.
Args:
key(`str`): The cache key.
Returns:
If file exist, return the cached file location, otherwise None.
Raises:
None
<Tip>
model_id = {owner}/{name}
</Tip>
"""
pass

def put_file(self, key, location):
"""Put file to the cache,
Args:
key(`str`): The cache key
location(`str`): Location of the file, we will move the file to cache.
Returns:
The cached file path of the file.
Raises:
None
<Tip>
model_id = {owner}/{name}
</Tip>
"""
pass

def remove_key(self, key):
Remove a cache key from the index; the file itself is removed separately.

Args:
key (dict): The cache key.
"""
self.cached_files.remove(key)
self.save_cached_files()

def exists(self, key):
for cache_file in self.cached_files:
if cache_file == key:
return True

return False

def clear_cache(self):
Remove all files and metadata from the cache

In the case of multiple cache locations, this clears only the last one,
which is assumed to be the read/write one.
"""
rmtree(self.cache_root_location)
self.load_cache()

def hash_name(self, key):
return hashlib.sha256(key.encode()).hexdigest()


class ModelFileSystemCache(FileSystemCache):
"""Local cache file layout
cache_root/owner/model_name/|individual cached files
|.mk: file, The cache index file
Save only one version for each file.
"""

def __init__(self, cache_root, owner, name):
"""Put file to the cache
Args:
cache_root(`str`): The modelscope local cache root(default: ~/.modelscope/cache/models/)
owner(`str`): The model owner.
name('str'): The name of the model
branch('str'): The branch of model
tag('str'): The tag of model
Returns:
Raises:
None
<Tip>
model_id = {owner}/{name}
</Tip>
"""
super().__init__(os.path.join(cache_root, owner, name))

def get_file_by_path(self, file_path):
"""Retrieve the cache if there is file match the path.
Args:
file_path (str): The file path in the model.
Returns:
path: the full path of the file.
"""
for cached_file in self.cached_files:
if file_path == cached_file['Path']:
cached_file_path = os.path.join(self.cache_root_location,
cached_file['Path'])
if os.path.exists(cached_file_path):
return cached_file_path
else:
self.remove_key(cached_file)

return None

def get_file_by_path_and_commit_id(self, file_path, commit_id):
"""Retrieve the cache if there is file match the path.
Args:
file_path (str): The file path in the model.
commit_id (str): The commit id of the file
Returns:
path: the full path of the file.
"""
for cached_file in self.cached_files:
if file_path == cached_file['Path'] and \
(cached_file['Revision'].startswith(commit_id) or commit_id.startswith(cached_file['Revision'])):
cached_file_path = os.path.join(self.cache_root_location,
cached_file['Path'])
if os.path.exists(cached_file_path):
return cached_file_path
else:
self.remove_key(cached_file)

return None

def get_file_by_info(self, model_file_info):
"""Check if exist cache file.

Args:
model_file_info (ModelFileInfo): The file information of the file.

Returns:
_type_: _description_
"""
cache_key = self.__get_cache_key(model_file_info)
for cached_file in self.cached_files:
if cached_file == cache_key:
orig_path = os.path.join(self.cache_root_location,
cached_file['Path'])
if os.path.exists(orig_path):
return orig_path
else:
self.remove_key(cached_file)

return None

def __get_cache_key(self, model_file_info):
cache_key = {
'Path': model_file_info['Path'],
'Revision': model_file_info['Revision'], # commit id
}
return cache_key

def exists(self, model_file_info):
"""Check the file is cached or not.

Args:
model_file_info (CachedFileInfo): The cached file info

Returns:
bool: If exists return True otherwise False
"""
key = self.__get_cache_key(model_file_info)
is_exists = False
for cached_key in self.cached_files:
if cached_key['Path'] == key['Path'] and (
cached_key['Revision'].startswith(key['Revision'])
or key['Revision'].startswith(cached_key['Revision'])):
is_exists = True
file_path = os.path.join(self.cache_root_location,
model_file_info['Path'])
if is_exists:
if os.path.exists(file_path):
return True
else:
self.remove_key(
model_file_info) # someone may have manually deleted the file
return False

def remove_if_exists(self, model_file_info):
"""We in cache, remove it.

Args:
model_file_info (ModelFileInfo): The model file information from server.
"""
for cached_file in self.cached_files:
if cached_file['Path'] == model_file_info['Path']:
self.remove_key(cached_file)
file_path = os.path.join(self.cache_root_location,
cached_file['Path'])
if os.path.exists(file_path):
os.remove(file_path)

def put_file(self, model_file_info, model_file_location):
"""Put model on model_file_location to cache, the model first download to /tmp, and move to cache.

Args:
model_file_info (str): The file description returned by get_model_files
sample:
{
"CommitMessage": "add model\n",
"CommittedDate": 1654857567,
"CommitterName": "mulin.lyh",
"IsLFS": false,
"Mode": "100644",
"Name": "resnet18.pth",
"Path": "resnet18.pth",
"Revision": "09b68012b27de0048ba74003690a890af7aff192",
"Size": 46827520,
"Type": "blob"
}
model_file_location (str): The location of the temporary file.

Returns:
str: The location of the cached file.
"""
self.remove_if_exists(model_file_info) # backup old revision
cache_key = self.__get_cache_key(model_file_info)
cache_full_path = os.path.join(
self.cache_root_location,
cache_key['Path']) # Branch and Tag do not have same name.
cache_file_dir = os.path.dirname(cache_full_path)
if not os.path.exists(cache_file_dir):
os.makedirs(cache_file_dir, exist_ok=True)
# We can't make operation transaction
move(model_file_location, cache_full_path)
self.cached_files.append(cache_key)
self.save_cached_files()
return cache_full_path
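
A minimal usage sketch of ModelFileSystemCache, using the sample file description above (hedged: paths are placeholders, and the temporary file is assumed to exist):

from modelscope.hub.utils.caching import ModelFileSystemCache

cache = ModelFileSystemCache('/tmp/model_cache', 'damo', 'resnet18')
file_info = {
    'Path': 'resnet18.pth',
    'Revision': '09b68012b27de0048ba74003690a890af7aff192',
}
cached_path = cache.put_file(file_info, '/tmp/resnet18.pth')  # moves the temp file into the cache
assert cache.exists(file_info)
assert cache.get_file_by_path('resnet18.pth') == cached_path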

+39 -0 modelscope/hub/utils/utils.py

@@ -0,0 +1,39 @@
import os

from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
DEFAULT_MODELSCOPE_GITLAB_DOMAIN,
DEFAULT_MODELSCOPE_GROUP,
MODEL_ID_SEPARATOR,
MODELSCOPE_URL_SCHEME)


def model_id_to_group_owner_name(model_id):
if MODEL_ID_SEPARATOR in model_id:
group_or_owner = model_id.split(MODEL_ID_SEPARATOR)[0]
name = model_id.split(MODEL_ID_SEPARATOR)[1]
else:
group_or_owner = DEFAULT_MODELSCOPE_GROUP
name = model_id
return group_or_owner, name


def get_cache_dir():
"""
cache dir precedence:
function parameter > environment > ~/.cache/modelscope/hub
"""
default_cache_dir = os.path.expanduser(
os.path.join('~/.cache', 'modelscope'))
return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir,
'hub'))


def get_endpoint():
modelscope_domain = os.getenv('MODELSCOPE_DOMAIN',
DEFAULT_MODELSCOPE_DOMAIN)
return MODELSCOPE_URL_SCHEME + modelscope_domain


def get_gitlab_domain():
return os.getenv('MODELSCOPE_GITLAB_DOMAIN',
DEFAULT_MODELSCOPE_GITLAB_DOMAIN)
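
The model id convention these helpers implement, in a short sketch:

from modelscope.hub.utils.utils import model_id_to_group_owner_name

assert model_id_to_group_owner_name('my-user/my-model') == ('my-user', 'my-model')
# a bare name falls back to the default group
assert model_id_to_group_owner_name('my-model') == ('damo', 'my-model')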

+104 -0 modelscope/metainfo.py

@@ -0,0 +1,104 @@
# Copyright (c) Alibaba, Inc. and its affiliates.


class Models(object):
""" Names for different models.

Holds the standard model name to use for identifying different models.
This should be used to register models.

Model name should only contain model info but not task info.
"""
# vision models

# nlp models
bert = 'bert'
palm = 'palm-v2'
structbert = 'structbert'
veco = 'veco'

# audio models
sambert_hifi_16k = 'sambert-hifi-16k'
generic_tts_frontend = 'generic-tts-frontend'
hifigan16k = 'hifigan16k'

# multi-modal models
ofa = 'ofa'


class Pipelines(object):
""" Names for different pipelines.

Holds the standard pipeline name to use for identifying different pipelines.
This should be used to register pipelines.

For a pipeline that supports different models and implements a common function, we
should use the task name as the pipeline name.
For a pipeline that supports only one model, we should use ${Model}-${Task} as its name.
"""
# vision tasks
image_matting = 'unet-image-matting'
person_image_cartoon = 'unet-person-image-cartoon'
ocr_detection = 'resnet18-ocr-detection'
action_recognition = 'TAdaConv_action-recognition'

# nlp tasks
sentence_similarity = 'sentence-similarity'
word_segmentation = 'word-segmentation'
text_generation = 'text-generation'
sentiment_analysis = 'sentiment-analysis'
sentiment_classification = 'sentiment-classification'
fill_mask = 'fill-mask'
nli = 'nli'
dialog_intent_prediction = 'dialog-intent-prediction'
dialog_modeling = 'dialog-modeling'
dialog_state_tracking = 'dialog_state_tracking'

# audio tasks
sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts'
speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k'

# multi-modal tasks
image_caption = 'image-caption'


class Trainers(object):
""" Names for different trainer.

Holds the standard trainer name to use for identifying different trainer.
This should be used to register trainers.

For a general Trainer, you can use easynlp-trainer/ofa-trainer/sofa-trainer.
For a model specific Trainer, you can use ${ModelName}-${Task}-trainer.
"""

default = 'Trainer'


class Preprocessors(object):
""" Names for different preprocessor.

Holds the standard preprocessor name to use for identifying different preprocessor.
This should be used to register preprocessors.

For a general preprocessor, just use the function name as the preprocessor name, such as
resize-image, random-crop.
For a model-specific preprocessor, use ${modelname}-${function}.
"""

# cv preprocessor
load_image = 'load-image'

# nlp preprocessor
bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
palm_text_gen_tokenizer = 'palm-text-gen-tokenizer'
token_cls_tokenizer = 'token-cls-tokenizer'
nli_tokenizer = 'nli-tokenizer'
sen_cls_tokenizer = 'sen-cls-tokenizer'

# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'
text_to_tacotron_symbols = 'text-to-tacotron-symbols'

# multi-modal
ofa_image_caption = 'ofa-image-caption'
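
How these constants are meant to be used when registering a component, in a hedged sketch (the model class below is illustrative, mirroring the register_module calls elsewhere in this commit):

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks

@MODELS.register_module(Tasks.text_to_speech, module_name=Models.hifigan16k)
class MyVocoder(Model):  # hypothetical class, for illustration only
    ...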

+7 -3 modelscope/models/__init__.py

@@ -1,7 +1,11 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio.tts.am import SambertNetHifi16k
from .audio.tts.vocoder import Hifigan16k
# from .audio.tts.am import SambertNetHifi16k
# from .audio.tts.vocoder import Hifigan16k
from .base import Model
from .builder import MODELS, build_model
from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity
# from .multi_model import OfaForImageCaptioning
from .nlp import (BertForSequenceClassification, SbertForNLI,
SbertForSentenceSimilarity, SbertForSentimentClassification,
SbertForTokenClassification, StructBertForMaskedLM,
VecoForMaskedLM)

+4 -2 modelscope/models/audio/tts/am/sambert_hifi_16k.py

@@ -6,6 +6,7 @@ import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
@@ -17,7 +18,7 @@ __all__ = ['SambertNetHifi16k']


def multi_label_symbol_to_sequence(my_classes, my_symbol):
one_hot = MultiLabelBinarizer(my_classes)
one_hot = MultiLabelBinarizer(classes=my_classes)
tokens = my_symbol.strip().split(' ')
sequences = []
for token in tokens:
@@ -26,7 +27,8 @@ def multi_label_symbol_to_sequence(my_classes, my_symbol):
return one_hot.fit_transform(sequences)


@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k')
@MODELS.register_module(
Tasks.text_to_speech, module_name=Models.sambert_hifi_16k)
class SambertNetHifi16k(Model):

def __init__(self,


+3 -3 modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py

@@ -2,8 +2,7 @@ import os
import zipfile
from typing import Any, Dict, List

import ttsfrd

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import (
@@ -15,11 +14,12 @@ __all__ = ['GenericTtsFrontend']


@MODELS.register_module(
Tasks.text_to_speech, module_name=r'generic_tts_frontend')
Tasks.text_to_speech, module_name=Models.generic_tts_frontend)
class GenericTtsFrontend(Model):

def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
import ttsfrd
frontend = ttsfrd.TtsFrontendEngine()
zip_file = os.path.join(model_dir, 'resource.zip')
self._res_path = os.path.join(model_dir, 'resource')


+2 -1 modelscope/models/audio/tts/vocoder/hifigan16k.py

@@ -10,6 +10,7 @@ import numpy as np
import torch
from scipy.io.wavfile import write

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.tts_exceptions import \
@@ -36,7 +37,7 @@ class AttrDict(dict):
self.__dict__ = self


@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k')
@MODELS.register_module(Tasks.text_to_speech, module_name=Models.hifigan16k)
class Hifigan16k(Model):

def __init__(self, model_dir, *args, **kwargs):


+1 -1 modelscope/models/audio/tts/vocoder/models/models.py

@@ -3,7 +3,6 @@ from distutils.version import LooseVersion
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_wavelets import DWT1DForward
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

@@ -357,6 +356,7 @@ class MultiScaleDiscriminator(torch.nn.Module):
DiscriminatorS(),
DiscriminatorS(),
])
from pytorch_wavelets import DWT1DForward
self.meanpools = nn.ModuleList(
[DWT1DForward(wave='db3', J=1),
DWT1DForward(wave='db3', J=1)])


+16 -11 modelscope/models/base.py

@@ -4,12 +4,13 @@ import os.path as osp
from abc import ABC, abstractmethod
from typing import Dict, Union

from maas_hub.snapshot_download import snapshot_download

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.builder import build_model
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.logger import get_logger

logger = get_logger()

Tensor = Union['torch.Tensor', 'tf.Tensor']

@@ -47,21 +48,25 @@ class Model(ABC):
if osp.exists(model_name_or_path):
local_model_dir = model_name_or_path
else:
cache_path = get_model_cache_dir(model_name_or_path)
local_model_dir = cache_path if osp.exists(
cache_path) else snapshot_download(model_name_or_path)
# else:
# raise ValueError(
# 'Remote model repo {model_name_or_path} does not exists')

local_model_dir = snapshot_download(model_name_or_path)
logger.info(f'initialize model from {local_model_dir}')
cfg = Config.from_file(
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
model_cfg = cfg.model
assert hasattr(
cfg, 'pipeline'), 'pipeline config is missing from config file.'
pipeline_cfg = cfg.pipeline
# TODO @wenmeng.zwm: we may need to manually initialize the model after model building
if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type

model_cfg.model_dir = local_model_dir

for k, v in kwargs.items():
setattr(model_cfg, k, v)
return build_model(model_cfg, task_name)
model = build_model(model_cfg, task_name)

# dynamically add pipeline info to model for pipeline inference
model.pipeline = pipeline_cfg
return model
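
A usage sketch of the updated model-loading path (hedged: the enclosing classmethod's name is not visible in this hunk, and the model id is a placeholder):

from modelscope.models import Model

# assumption: this hunk belongs to Model.from_pretrained
model = Model.from_pretrained('damo/my-model')
print(model.pipeline)  # pipeline config attached above for pipeline inference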

+0 -0 modelscope/models/cv/action_recognition/__init__.py


+91 -0 modelscope/models/cv/action_recognition/models.py

@@ -0,0 +1,91 @@
import torch
import torch.nn as nn

from .tada_convnext import TadaConvNeXt


class BaseVideoModel(nn.Module):
"""
Standard video model.
The model is divided into the backbone and the head, where the backbone
extracts features and the head performs classification.

The backbones can be defined in model/base/backbone.py or anywhere else
as long as the backbone is registered by the BACKBONE_REGISTRY.
The heads can be defined in model/module_zoo/heads/ or anywhere else
as long as the head is registered by the HEAD_REGISTRY.

The registries automatically find the registered modules and construct
the base video model.
"""

def __init__(self, cfg):
"""
Args:
cfg (Config): global config object.
"""
super(BaseVideoModel, self).__init__()
# the backbone is created according to meta-architectures
# defined in models/base/backbone.py
self.backbone = TadaConvNeXt(cfg)

# the head is created according to the heads
# defined in models/module_zoo/heads
self.head = BaseHead(cfg)

def forward(self, x):
x = self.backbone(x)
x = self.head(x)
return x


class BaseHead(nn.Module):
"""
Constructs base head.
"""

def __init__(
self,
cfg,
):
"""
Args:
cfg (Config): global config object.
"""
super(BaseHead, self).__init__()
self.cfg = cfg
dim = cfg.VIDEO.BACKBONE.NUM_OUT_FEATURES
num_classes = cfg.VIDEO.HEAD.NUM_CLASSES
dropout_rate = cfg.VIDEO.HEAD.DROPOUT_RATE
activation_func = cfg.VIDEO.HEAD.ACTIVATION
self._construct_head(dim, num_classes, dropout_rate, activation_func)

def _construct_head(self, dim, num_classes, dropout_rate, activation_func):
self.global_avg_pool = nn.AdaptiveAvgPool3d(1)

if dropout_rate > 0.0:
self.dropout = nn.Dropout(dropout_rate)

self.out = nn.Linear(dim, num_classes, bias=True)

if activation_func == 'softmax':
self.activation = nn.Softmax(dim=-1)
elif activation_func == 'sigmoid':
self.activation = nn.Sigmoid()
else:
raise NotImplementedError('{} is not supported as an activation '
'function.'.format(activation_func))

def forward(self, x):
if len(x.shape) == 5:
x = self.global_avg_pool(x)
# (N, C, T, H, W) -> (N, T, H, W, C).
x = x.permute((0, 2, 3, 4, 1))
if hasattr(self, 'dropout'):
out = self.dropout(x)
else:
out = x
out = self.out(out)
out = self.activation(out)
out = out.view(out.shape[0], -1)
return out, x.view(x.shape[0], -1)
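
A shape-check sketch for BaseHead (hedged: the cfg is mocked with SimpleNamespace; the real project config object differs):

import torch
from types import SimpleNamespace as NS
from modelscope.models.cv.action_recognition.models import BaseHead

cfg = NS(VIDEO=NS(
    BACKBONE=NS(NUM_OUT_FEATURES=768),
    HEAD=NS(NUM_CLASSES=400, DROPOUT_RATE=0.5, ACTIVATION='softmax')))
head = BaseHead(cfg)
logits, feats = head(torch.randn(2, 768, 4, 7, 7))  # (N, C, T, H, W)
print(logits.shape, feats.shape)  # torch.Size([2, 400]) torch.Size([2, 768])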

+472 -0 modelscope/models/cv/action_recognition/tada_convnext.py

@@ -0,0 +1,472 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair, _triple


def drop_path(x, drop_prob: float = 0., training: bool = False):
"""
From https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py.
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0], ) + (1, ) * (
x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output


class DropPath(nn.Module):
"""
From https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py.
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""

def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob

def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
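
# A small numeric sketch of what drop_path does per sample (hedged: illustrative only):
#   x = torch.ones(4, 3, 8, 8)
#   y = drop_path(x, drop_prob=0.5, training=True)
#   each sample in the batch is either zeroed out or rescaled by 1/keep_prob,
#   so y[:, 0, 0, 0] contains only 0.0 or 2.0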


class TadaConvNeXt(nn.Module):
r""" ConvNeXt
A PyTorch impl of : `A ConvNet for the 2020s` -
https://arxiv.org/pdf/2201.03545.pdf

Args:
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
drop_path_rate (float): Stochastic depth rate. Default: 0.
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
"""

def __init__(
self, cfg
# in_chans=3, num_classes=1000,
# depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0.,
# layer_scale_init_value=1e-6, head_init_scale=1.,
):
super().__init__()
in_chans = cfg.VIDEO.BACKBONE.NUM_INPUT_CHANNELS
dims = cfg.VIDEO.BACKBONE.NUM_FILTERS
drop_path_rate = cfg.VIDEO.BACKBONE.DROP_PATH
depths = cfg.VIDEO.BACKBONE.DEPTH
layer_scale_init_value = cfg.VIDEO.BACKBONE.LARGE_SCALE_INIT_VALUE
stem_t_kernel_size = cfg.VIDEO.BACKBONE.STEM.T_KERNEL_SIZE if hasattr(
cfg.VIDEO.BACKBONE.STEM, 'T_KERNEL_SIZE') else 2
t_stride = cfg.VIDEO.BACKBONE.STEM.T_STRIDE if hasattr(
cfg.VIDEO.BACKBONE.STEM, 'T_STRIDE') else 2

self.downsample_layers = nn.ModuleList(
) # stem and 3 intermediate downsampling conv layers
stem = nn.Sequential(
nn.Conv3d(
in_chans,
dims[0],
kernel_size=(stem_t_kernel_size, 4, 4),
stride=(t_stride, 4, 4),
padding=((stem_t_kernel_size - 1) // 2, 0, 0)),
LayerNorm(dims[0], eps=1e-6, data_format='channels_first'))
self.downsample_layers.append(stem)
for i in range(3):
downsample_layer = nn.Sequential(
LayerNorm(dims[i], eps=1e-6, data_format='channels_first'),
nn.Conv3d(
dims[i],
dims[i + 1],
kernel_size=(1, 2, 2),
stride=(1, 2, 2)),
)
self.downsample_layers.append(downsample_layer)

self.stages = nn.ModuleList(
) # 4 feature resolution stages, each consisting of multiple residual blocks
dp_rates = [
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
]
cur = 0
for i in range(4):
stage = nn.Sequential(*[
TAdaConvNeXtBlock(
cfg,
dim=dims[i],
drop_path=dp_rates[cur + j],
layer_scale_init_value=layer_scale_init_value)
for j in range(depths[i])
])
self.stages.append(stage)
cur += depths[i]

self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer

def forward_features(self, x):
for i in range(4):
x = self.downsample_layers[i](x)
x = self.stages[i](x)
return self.norm(x.mean(
    [-3, -2, -1]))  # global average pooling, (N, C, T, H, W) -> (N, C)

def forward(self, x):
if isinstance(x, dict):
x = x['video']
x = self.forward_features(x)
return x

def get_num_layers(self):
return 12, 0


class ConvNeXtBlock(nn.Module):
r""" ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, T, H, W)
(2) DwConv -> Permute to (N, T, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in PyTorch

Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""

def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6):
super().__init__()
self.dwconv = nn.Conv3d(
dim, dim, kernel_size=(1, 7, 7), padding=(0, 3, 3),
groups=dim) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(
dim,
4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = nn.Linear(4 * dim, dim)
self.gamma = nn.Parameter(
layer_scale_init_value * torch.ones((dim)),
requires_grad=True) if layer_scale_init_value > 0 else None
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def forward(self, x):
input = x
x = self.dwconv(x)
x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C)
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = self.gamma * x
x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W)

x = input + self.drop_path(x)
return x


class LayerNorm(nn.Module):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs
with channels as the last dimension, e.g. (batch, time, height, width, channels),
while channels_first puts channels at dimension 1, e.g. (batch, channels, time,
height, width), as used for the 5D video tensors in this file.
"""

def __init__(self,
normalized_shape,
eps=1e-6,
data_format='channels_last'):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
if self.data_format not in ['channels_last', 'channels_first']:
raise NotImplementedError
self.normalized_shape = (normalized_shape, )

def forward(self, x):
if self.data_format == 'channels_last':
return F.layer_norm(x, self.normalized_shape, self.weight,
self.bias, self.eps)
elif self.data_format == 'channels_first':
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None, None] * x + self.bias[:, None, None,
None]
return x
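# Illustrative equivalence sketch: on 5D video tensors, the channels_first
# branch matches F.layer_norm applied with channels moved to the last dim.
_ln = LayerNorm(8, data_format='channels_first')
_x = torch.randn(2, 8, 4, 4, 4)
_ref = F.layer_norm(
    _x.permute(0, 2, 3, 4, 1), (8, ), _ln.weight, _ln.bias, _ln.eps)
assert torch.allclose(_ln(_x), _ref.permute(0, 4, 1, 2, 3), atol=1e-5)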


class TAdaConvNeXtBlock(nn.Module):
r""" ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_fi rst) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in PyTorch

Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""

def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6):
super().__init__()
layer_scale_init_value = float(layer_scale_init_value)
self.dwconv = TAdaConv2d(
dim,
dim,
kernel_size=(1, 7, 7),
padding=(0, 3, 3),
groups=dim,
cal_dim='cout')
route_func_type = cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_TYPE
if route_func_type == 'normal':
self.dwconv_rf = RouteFuncMLP(
c_in=dim,
ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R,
kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K,
with_bias_cal=self.dwconv.bias is not None)
elif route_func_type == 'normal_lngelu':
self.dwconv_rf = RouteFuncMLPLnGelu(
c_in=dim,
ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R,
kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K,
with_bias_cal=self.dwconv.bias is not None)
else:
raise ValueError(
'Unknown route_func_type: {}'.format(route_func_type))
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(
dim,
4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = nn.Linear(4 * dim, dim)
self.gamma = nn.Parameter(
layer_scale_init_value * torch.ones((dim)),
requires_grad=True) if layer_scale_init_value > 0 else None
self.drop_path = DropPath(
drop_path) if drop_path > 0. else nn.Identity()

def forward(self, x):
input = x
x = self.dwconv(x, self.dwconv_rf(x))
x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C)
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = self.gamma * x
x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W)

x = input + self.drop_path(x)
return x


class RouteFuncMLPLnGelu(nn.Module):
"""
The routing function for generating the calibration weights, using LayerNorm
and GELU in place of BatchNorm and ReLU.
"""

def __init__(self,
c_in,
ratio,
kernels,
with_bias_cal=False,
bn_eps=1e-5,
bn_mmt=0.1):
"""
Args:
c_in (int): number of input channels.
ratio (int): reduction ratio for the routing function.
kernels (list): temporal kernel sizes of the stacked 1D convolutions.
with_bias_cal (bool): whether to also generate calibration weights
    for the convolution bias.
"""
super(RouteFuncMLPLnGelu, self).__init__()
self.c_in = c_in
self.with_bias_cal = with_bias_cal
self.avgpool = nn.AdaptiveAvgPool3d((None, 1, 1))
self.globalpool = nn.AdaptiveAvgPool3d(1)
self.g = nn.Conv3d(
in_channels=c_in,
out_channels=c_in,
kernel_size=1,
padding=0,
)
self.a = nn.Conv3d(
in_channels=c_in,
out_channels=int(c_in // ratio),
kernel_size=[kernels[0], 1, 1],
padding=[kernels[0] // 2, 0, 0],
)
# self.bn = nn.BatchNorm3d(int(c_in//ratio), eps=bn_eps, momentum=bn_mmt)
self.ln = LayerNorm(
int(c_in // ratio), eps=1e-6, data_format='channels_first')
self.gelu = nn.GELU()
# self.relu = nn.ReLU(inplace=True)
self.b = nn.Conv3d(
in_channels=int(c_in // ratio),
out_channels=c_in,
kernel_size=[kernels[1], 1, 1],
padding=[kernels[1] // 2, 0, 0],
bias=False)
self.b.skip_init = True
self.b.weight.data.zero_()  # zero init so that the initial
# calibration output is exactly 1.
if with_bias_cal:
self.b_bias = nn.Conv3d(
in_channels=int(c_in // ratio),
out_channels=c_in,
kernel_size=[kernels[1], 1, 1],
padding=[kernels[1] // 2, 0, 0],
bias=False)
self.b_bias.skip_init = True
self.b_bias.weight.data.zero_()  # zero init so that the initial
# calibration output is exactly 1.

def forward(self, x):
g = self.globalpool(x)
x = self.avgpool(x)
x = self.a(x + self.g(g))
# x = self.bn(x)
# x = self.relu(x)
x = self.ln(x)
x = self.gelu(x)
if self.with_bias_cal:
return [self.b(x) + 1, self.b_bias(x) + 1]
else:
return self.b(x) + 1


class TAdaConv2d(nn.Module):
"""
Performs temporally adaptive 2D convolution.
Currently, only application on 5D tensors is supported, which makes TAdaConv2d
essentially a 3D convolution with temporal kernel size of 1.
"""

def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
cal_dim='cin'):
super(TAdaConv2d, self).__init__()
"""
Args:
in_channels (int): number of input channels.
out_channels (int): number of output channels.
kernel_size (list): kernel size of TAdaConv2d.
stride (list): stride for the convolution in TAdaConv2d.
padding (list): padding for the convolution in TAdaConv2d.
dilation (list): dilation of the convolution in TAdaConv2d.
groups (int): number of groups for TAdaConv2d.
bias (bool): whether to use bias in TAdaConv2d.
cal_dim (str): calibrated dimension in TAdaConv2d.
    Supported values: "cin", "cout".
"""

kernel_size = _triple(kernel_size)
stride = _triple(stride)
padding = _triple(padding)
dilation = _triple(dilation)

assert kernel_size[0] == 1
assert stride[0] == 1
assert padding[0] == 0
assert dilation[0] == 1
assert cal_dim in ['cin', 'cout']

self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
self.cal_dim = cal_dim

# base weights (W_b)
self.weight = nn.Parameter(
torch.Tensor(1, 1, out_channels, in_channels // groups,
kernel_size[1], kernel_size[2]))
if bias:
self.bias = nn.Parameter(torch.Tensor(1, 1, out_channels))
else:
self.register_parameter('bias', None)

nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in)
nn.init.uniform_(self.bias, -bound, bound)

def forward(self, x, alpha):
"""
Args:
x (tensor): feature to perform convolution on.
alpha (tensor): calibration weight for the base weights.
W_t = alpha_t * W_b
"""
if isinstance(alpha, list):
w_alpha, b_alpha = alpha[0], alpha[1]
else:
w_alpha = alpha
b_alpha = None
_, _, c_out, c_in, kh, kw = self.weight.size()
b, c_in, t, h, w = x.size()
x = x.permute(0, 2, 1, 3, 4).reshape(1, -1, h, w)

if self.cal_dim == 'cin':
# w_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, 1, C, H(1), W(1)
# corresponding to calibrating the input channel
weight = (w_alpha.permute(0, 2, 1, 3, 4).unsqueeze(2)
* self.weight).reshape(-1, c_in // self.groups, kh, kw)
elif self.cal_dim == 'cout':
# w_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, C, 1, H(1), W(1)
# corresponding to calibrating the output channel
weight = (w_alpha.permute(0, 2, 1, 3, 4).unsqueeze(3)
* self.weight).reshape(-1, c_in // self.groups, kh, kw)

bias = None
if self.bias is not None:
if b_alpha is not None:
# b_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, C
bias = (b_alpha.permute(0, 2, 1, 3, 4).squeeze()
* self.bias).reshape(-1)
else:
bias = self.bias.repeat(b, t, 1).reshape(-1)
output = F.conv2d(
x,
weight=weight,
bias=bias,
stride=self.stride[1:],
padding=self.padding[1:],
dilation=self.dilation[1:],
groups=self.groups * b * t)

output = output.view(b, t, c_out, output.size(-2),
output.size(-1)).permute(0, 2, 1, 3, 4)

return output

def __repr__(self):
return f'TAdaConv2d({self.in_channels}, {self.out_channels}, kernel_size={self.kernel_size}, ' +\
f"stride={self.stride}, padding={self.padding}, bias={self.bias is not None}, cal_dim=\"{self.cal_dim}\")"

+1 -0 modelscope/models/multi_model/__init__.py

@@ -0,0 +1 @@
from .image_captioning_model import OfaForImageCaptioning

+80 -0 modelscope/models/multi_model/image_captioning_model.py

@@ -0,0 +1,80 @@
import os.path as osp
from typing import Any, Dict

from PIL import Image

from modelscope.metainfo import Models
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Model
from ..builder import MODELS

__all__ = ['OfaForImageCaptioning']


@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa)
class OfaForImageCaptioning(Model):

def __init__(self, model_dir, *args, **kwargs):
super().__init__(model_dir=model_dir, *args, **kwargs)
ckpt_name = ModelFile.TORCH_MODEL_FILE
local_model = osp.join(model_dir, ckpt_name)
bpe_dir = model_dir
# turn on cuda if GPU is available
from fairseq import checkpoint_utils, tasks, utils
from ofa.tasks.mm_tasks import CaptionTask
from ofa.utils.eval_utils import eval_caption
self.eval_caption = eval_caption

tasks.register_task('caption', CaptionTask)
use_cuda = kwargs.get('use_cuda', False)
use_fp16 = use_cuda and kwargs.get('use_fp16', False)
overrides = {
'bpe_dir': bpe_dir,
'eval_cider': False,
'beam': 5,
'max_len_b': 16,
'no_repeat_ngram_size': 3,
'seed': 7
}
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(local_model), arg_overrides=overrides)

# Move models to GPU
for model in models:
model.eval()
if use_cuda:
model.cuda()
if use_fp16:
model.half()
model.prepare_for_inference_(cfg)
self.models = models
# Initialize generator
self.generator = task.build_generator(models, cfg.generation)

# Initialize transform
from torchvision import transforms
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]

self.patch_resize_transform = transforms.Compose([
lambda image: image.convert('RGB'),
transforms.Resize(
(cfg.task.patch_image_size, cfg.task.patch_image_size),
interpolation=Image.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=mean, std=std),
])
self.task = task

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
results, _ = self.eval_caption(self.task, self.generator, self.models,
input)
return {
'image_id': results[0]['image_id'],
'caption': results[0]['caption']
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# forward already returns the final caption; nothing further to do here
return inputs
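
For orientation, a hedged end-to-end sketch of how this model is reached
through the pipeline API; the model id is the image-captioning default
registered in builder.py below, and the image path is a placeholder:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

captioner = pipeline(
    task=Tasks.image_captioning,
    model='damo/ofa_image-caption_coco_large_en')
print(captioner('path/to/image.jpg')['caption'])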

+3 -0 modelscope/models/nlp/__init__.py

@@ -1,6 +1,9 @@
from .bert_for_sequence_classification import * # noqa F403
from .masked_language_model import * # noqa F403
from .palm_for_text_generation import * # noqa F403
from .sbert_for_nli import * # noqa F403
from .sbert_for_sentence_similarity import * # noqa F403
from .sbert_for_sentiment_classification import * # noqa F403
from .sbert_for_token_classification import * # noqa F403
from .space.dialog_intent_prediction_model import * # noqa F403
from .space.dialog_modeling_model import * # noqa F403


+2 -2 modelscope/models/nlp/bert_for_sequence_classification.py

@@ -4,6 +4,7 @@ from typing import Any, Dict
import json
import numpy as np

from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..base import Model
from ..builder import MODELS
@@ -11,8 +12,7 @@ from ..builder import MODELS
__all__ = ['BertForSequenceClassification']


@MODELS.register_module(
Tasks.text_classification, module_name=r'bert-sentiment-analysis')
@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
class BertForSequenceClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):


+63 -0 modelscope/models/nlp/masked_language_model.py

@@ -0,0 +1,63 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from ...metainfo import Models
from ...utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM']


class MaskedLanguageModelBase(Model):

def __init__(self, model_dir: str, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
self.model = self.build_model()

def build_model(self):
raise NotImplementedError()

def train(self):
return self.model.train()

def eval(self):
return self.model.eval()

@property
def config(self):
if hasattr(self.model, 'config'):
return self.model.config
return None

def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
"""
rst = self.model(
input_ids=input['input_ids'],
attention_mask=input['attention_mask'],
token_type_ids=input['token_type_ids'])
return {'logits': rst['logits'], 'input_ids': input['input_ids']}


@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
class StructBertForMaskedLM(MaskedLanguageModelBase):

def build_model(self):
from sofa import SbertForMaskedLM
return SbertForMaskedLM.from_pretrained(self.model_dir)


@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
class VecoForMaskedLM(MaskedLanguageModelBase):

def build_model(self):
from sofa import VecoForMaskedLM
return VecoForMaskedLM.from_pretrained(self.model_dir)
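
A smoke-test sketch for the classes above (the checkpoint directory and token
ids are placeholders; requires the `sofa` package):

import torch

model = StructBertForMaskedLM('/path/to/structbert')  # placeholder model dir
model.eval()
batch = {
    'input_ids': torch.tensor([[101, 103, 102]]),  # [CLS] [MASK] [SEP]
    'attention_mask': torch.tensor([[1, 1, 1]]),
    'token_type_ids': torch.tensor([[0, 0, 0]]),
}
out = model.forward(batch)  # {'logits': (1, 3, vocab_size), 'input_ids': ...}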

+9 -3 modelscope/models/nlp/palm_for_text_generation.py

@@ -1,13 +1,14 @@
from typing import Dict

from modelscope.utils.constant import Tasks
from ...metainfo import Models
from ...utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['PalmForTextGeneration']


@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0')
@MODELS.register_module(Tasks.text_generation, module_name=Models.palm)
class PalmForTextGeneration(Model):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -19,13 +20,18 @@ class PalmForTextGeneration(Model):
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir

from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator
model = PalmForConditionalGeneration.from_pretrained(model_dir)
self.tokenizer = model.tokenizer
self.generator = Translator(model)

def train(self):
return self.generator.train()

def eval(self):
return self.generator.eval()

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model



+23 -0 modelscope/models/nlp/sbert_for_nli.py

@@ -0,0 +1,23 @@
from ...metainfo import Models
from ...utils.constant import Tasks
from ..builder import MODELS
from .sbert_for_sequence_classification import \
SbertForSequenceClassificationBase

__all__ = ['SbertForNLI']


@MODELS.register_module(Tasks.nli, module_name=Models.structbert)
class SbertForNLI(SbertForSequenceClassificationBase):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(
model_dir, *args, model_args={'num_labels': 3}, **kwargs)
assert self.model.config.num_labels == 3

+8 -71 modelscope/models/nlp/sbert_for_sentence_similarity.py

@@ -1,46 +1,15 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa import SbertModel
from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel
from torch import nn

from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS
from .sbert_for_sequence_classification import \
SbertForSequenceClassificationBase

__all__ = ['SbertForSentenceSimilarity']


class SbertTextClassifier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return logits


@MODELS.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
class SbertForSentenceSimilarity(Model):
Tasks.sentence_similarity, module_name=Models.structbert)
class SbertForSentenceSimilarity(SbertForSequenceClassificationBase):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the sentence similarity model from the `model_dir` path.
@@ -50,39 +19,7 @@ class SbertForSentenceSimilarity(Model):
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(model_dir, *args, **kwargs)
super().__init__(
model_dir, *args, model_args={'num_labels': 2}, **kwargs)
self.model_dir = model_dir

self.model = SbertTextClassifier.from_pretrained(
model_dir, num_labels=2)
self.model.eval()
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {idx: name for name, idx in self.label_mapping.items()}

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
with torch.no_grad():
logits = self.model(input_ids, token_type_ids)
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res
assert self.model.config.num_labels == 2

+24 -0 modelscope/models/nlp/sbert_for_sentiment_classification.py

@@ -0,0 +1,24 @@
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..builder import MODELS
from .sbert_for_sequence_classification import \
SbertForSequenceClassificationBase

__all__ = ['SbertForSentimentClassification']


@MODELS.register_module(
Tasks.sentiment_classification, module_name=Models.structbert)
class SbertForSentimentClassification(SbertForSequenceClassificationBase):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_cls (Optional[Any], optional): model loader, if None, use the
default loader to load model weights, by default None.
"""
super().__init__(
model_dir, *args, model_args={'num_labels': 2}, **kwargs)
assert self.model.config.num_labels == 2

+71 -0 modelscope/models/nlp/sbert_for_sequence_classification.py

@@ -0,0 +1,71 @@
import os
from typing import Any, Dict

import json
import numpy as np
import torch
from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel
from torch import nn

from ..base import Model


class SbertTextClassifier(SbertPreTrainedModel):

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.encoder = SbertModel(config, add_pooling_layer=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, input_ids=None, token_type_ids=None):
outputs = self.encoder(
input_ids,
token_type_ids=token_type_ids,
return_dict=None,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
return {'logits': logits}


class SbertForSequenceClassificationBase(Model):

def __init__(self, model_dir: str, model_args=None, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
if model_args is None:
model_args = {}
self.model = SbertTextClassifier.from_pretrained(
    model_dir, **model_args)
self.id2label = {}
self.label_path = os.path.join(self.model_dir, 'label_mapping.json')
if os.path.exists(self.label_path):
with open(self.label_path) as f:
self.label_mapping = json.load(f)
self.id2label = {
idx: name
for name, idx in self.label_mapping.items()
}

def train(self):
return self.model.train()

def eval(self):
return self.model.eval()

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
input_ids = torch.tensor(input['input_ids'], dtype=torch.long)
token_type_ids = torch.tensor(
input['token_type_ids'], dtype=torch.long)
return self.model.forward(input_ids, token_type_ids)

def postprocess(self, input, **kwargs):
logits = input['logits']
probs = logits.softmax(-1).numpy()
pred = logits.argmax(-1).numpy()
logits = logits.numpy()
res = {'predictions': pred, 'probabilities': probs, 'logits': logits}
return res
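
To make the postprocess contract concrete, a small numeric sketch (the values
mirror the example that previously lived in the sentence-similarity docstring):

import torch

logits = torch.tensor([[-0.5386, 1.5029]])
probs = logits.softmax(-1)  # tensor([[0.1149, 0.8851]])
pred = logits.argmax(-1)    # tensor([1]) -> id2label maps 1 to its label name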

+18 -11 modelscope/models/nlp/sbert_for_token_classification.py

@@ -2,19 +2,17 @@ from typing import Any, Dict, Union

import numpy as np
import torch
from sofa import SbertConfig, SbertForTokenClassification

from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from ..base import Model, Tensor
from ..builder import MODELS

__all__ = ['StructBertForTokenClassification']
__all__ = ['SbertForTokenClassification']


@MODELS.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
class StructBertForTokenClassification(Model):
@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert)
class SbertForTokenClassification(Model):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the word segmentation model from the `model_dir` path.
@@ -26,9 +24,16 @@ class StructBertForTokenClassification(Model):
"""
super().__init__(model_dir, *args, **kwargs)
self.model_dir = model_dir
self.model = SbertForTokenClassification.from_pretrained(
import sofa
self.model = sofa.SbertForTokenClassification.from_pretrained(
self.model_dir)
self.config = SbertConfig.from_pretrained(self.model_dir)
self.config = sofa.SbertConfig.from_pretrained(self.model_dir)

def train(self):
return self.model.train()

def eval(self):
return self.model.eval()

def forward(self, input: Dict[str,
Any]) -> Dict[str, Union[str, np.ndarray]]:
@@ -47,10 +52,12 @@ class StructBertForTokenClassification(Model):
}
"""
input_ids = torch.tensor(input['input_ids']).unsqueeze(0)
output = self.model(input_ids)
logits = output.logits
return {**self.model(input_ids), 'text': input['text']}

def postprocess(self, input: Dict[str, Tensor],
**kwargs) -> Dict[str, Tensor]:
logits = input['logits']
pred = torch.argmax(logits[0], dim=-1)
pred = pred.numpy()

rst = {'predictions': pred, 'logits': logits, 'text': input['text']}
return rst

+5 -7 modelscope/models/nlp/space/dialog_intent_prediction_model.py

@@ -1,11 +1,10 @@
import os
from typing import Any, Dict

from modelscope.preprocessors.space.fields.intent_field import \
IntentBPETextField
from modelscope.trainers.nlp.space.trainers.intent_trainer import IntentTrainer
from modelscope.utils.config import Config
from modelscope.utils.constant import Tasks
from ....preprocessors.space.fields.intent_field import IntentBPETextField
from ....trainers.nlp.space.trainers.intent_trainer import IntentTrainer
from ....utils.config import Config
from ....utils.constant import Tasks
from ...base import Model, Tensor
from ...builder import MODELS
from .model.generator import Generator
@@ -14,8 +13,7 @@ from .model.model_base import ModelBase
__all__ = ['DialogIntentModel']


@MODELS.register_module(
Tasks.dialog_intent_prediction, module_name=r'space-intent')
@MODELS.register_module(Tasks.dialog_intent_prediction, module_name=r'space')
class DialogIntentModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):


+5 -6 modelscope/models/nlp/space/dialog_modeling_model.py

@@ -1,11 +1,10 @@
import os
from typing import Any, Dict, Optional

from modelscope.preprocessors.space.fields.gen_field import \
MultiWOZBPETextField
from modelscope.trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer
from modelscope.utils.config import Config
from modelscope.utils.constant import Tasks
from ....preprocessors.space.fields.gen_field import MultiWOZBPETextField
from ....trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer
from ....utils.config import Config
from ....utils.constant import Tasks
from ...base import Model, Tensor
from ...builder import MODELS
from .model.generator import Generator
@@ -14,7 +13,7 @@ from .model.model_base import ModelBase
__all__ = ['DialogModelingModel']


@MODELS.register_module(Tasks.dialog_modeling, module_name=r'space-modeling')
@MODELS.register_module(Tasks.dialog_modeling, module_name=r'space')
class DialogModelingModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):


+1 -1 modelscope/models/nlp/space/dialog_state_tracking.py

@@ -11,7 +11,7 @@ from .model.model_base import ModelBase
__all__ = ['DialogStateTrackingModel']


@MODELS.register_module(Tasks.dialog_state_tracking, module_name=r'space-dst')
@MODELS.register_module(Tasks.dialog_state_tracking, module_name=r'space')
class DialogStateTrackingModel(Model):

def __init__(self, model_dir: str, *args, **kwargs):


+1 -2 modelscope/models/nlp/space/model/gen_unified_transformer.py

@@ -3,8 +3,7 @@ IntentUnifiedTransformer
"""
import torch

from modelscope.models.nlp.space.model.unified_transformer import \
UnifiedTransformer
from .unified_transformer import UnifiedTransformer


class GenUnifiedTransformer(UnifiedTransformer):


+1 -1 modelscope/models/nlp/space/model/intent_unified_transformer.py

@@ -5,7 +5,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from modelscope.utils.nlp.space.criterions import compute_kl_loss
from .....utils.nlp.space.criterions import compute_kl_loss
from .unified_transformer import UnifiedTransformer




+3 -4 modelscope/models/nlp/space/model/unified_transformer.py

@@ -7,10 +7,9 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from modelscope.models.nlp.space.model.model_base import ModelBase
from modelscope.models.nlp.space.modules.embedder import Embedder
from modelscope.models.nlp.space.modules.transformer_block import \
TransformerBlock
from ..modules.embedder import Embedder
from ..modules.transformer_block import TransformerBlock
from .model_base import ModelBase


class UnifiedTransformer(ModelBase):


+2 -3 modelscope/models/nlp/space/modules/transformer_block.py

@@ -5,9 +5,8 @@ TransformerBlock class.
import torch
import torch.nn as nn

from modelscope.models.nlp.space.modules.feedforward import FeedForward
from modelscope.models.nlp.space.modules.multihead_attention import \
MultiheadAttention
from .feedforward import FeedForward
from .multihead_attention import MultiheadAttention


class TransformerBlock(nn.Module):


+1 -4 modelscope/pipelines/__init__.py

@@ -1,7 +1,4 @@
from .audio import LinearAECPipeline
# from .audio import LinearAECPipeline
from .base import Pipeline
from .builder import pipeline
from .cv import * # noqa F403
from .multi_modal import * # noqa F403
from .nlp import * # noqa F403
from .nlp.space import * # noqa F403

+3 -1 modelscope/pipelines/audio/linear_aec_pipeline.py

@@ -7,6 +7,7 @@ import scipy.io.wavfile as wav
import torch
import yaml

from modelscope.metainfo import Pipelines
from modelscope.preprocessors.audio import LinearAECAndFbank
from modelscope.utils.constant import ModelFile, Tasks
from ..base import Pipeline
@@ -39,7 +40,8 @@ def initialize_config(module_cfg):


@PIPELINES.register_module(
Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k')
Tasks.speech_signal_process,
module_name=Pipelines.speech_dfsmn_aec_psm_16k)
class LinearAECPipeline(Pipeline):
r"""AEC Inference Pipeline only support 16000 sample rate.



+2 -1 modelscope/pipelines/audio/text_to_speech_pipeline.py

@@ -3,6 +3,7 @@ from typing import Any, Dict, List

import numpy as np

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.audio.tts.am import SambertNetHifi16k
from modelscope.models.audio.tts.vocoder import Hifigan16k
@@ -15,7 +16,7 @@ __all__ = ['TextToSpeechSambertHifigan16kPipeline']


@PIPELINES.register_module(
Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k')
Tasks.text_to_speech, module_name=Pipelines.sambert_hifigan_16k_tts)
class TextToSpeechSambertHifigan16kPipeline(Pipeline):

def __init__(self,


+8 -14 modelscope/pipelines/base.py

@@ -4,19 +4,17 @@ import os.path as osp
from abc import ABC, abstractmethod
from typing import Any, Dict, Generator, List, Union

from maas_hub.snapshot_download import snapshot_download

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.base import Model
from modelscope.preprocessors import Preprocessor
from modelscope.pydatasets import PyDataset
from modelscope.utils.config import Config
from modelscope.utils.hub import get_model_cache_dir
from modelscope.utils.logger import get_logger
from .outputs import TASK_OUTPUTS
from .util import is_model_name
from .util import is_model, is_official_hub_path

Tensor = Union['torch.Tensor', 'tf.Tensor']
Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray']
InputModel = Union[str, Model]

output_keys = [
@@ -29,14 +27,10 @@ class Pipeline(ABC):

def initiate_single_model(self, model):
logger.info(f'initiate model from {model}')
# TODO @wenmeng.zwm replace model.startswith('damo/') with get_model
if isinstance(model, str) and model.startswith('damo/'):
if not osp.exists(model):
cache_path = get_model_cache_dir(model)
model = cache_path if osp.exists(
cache_path) else snapshot_download(model)
return Model.from_pretrained(model) if is_model_name(
model) else model
if isinstance(model, str) and is_official_hub_path(model):
model = snapshot_download(
model) if not osp.exists(model) else model
return Model.from_pretrained(model) if is_model(model) else model
elif isinstance(model, Model):
return model
else:
@@ -104,7 +98,7 @@ class Pipeline(ABC):

def _process_single(self, input: Input, *args,
**post_kwargs) -> Dict[str, Any]:
out = self.preprocess(input, **post_kwargs)
out = self.preprocess(input)
out = self.forward(out)
out = self.postprocess(out, **post_kwargs)
self._check_output(out)


+49 -23 modelscope/pipelines/builder.py

@@ -1,33 +1,49 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os.path as osp
from typing import List, Union

from modelscope.metainfo import Pipelines
from modelscope.models.base import Model
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import Tasks
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.hub import read_config
from modelscope.utils.registry import Registry, build_from_cfg
from .base import Pipeline
from .util import is_official_hub_path

PIPELINES = Registry('pipelines')

DEFAULT_MODEL_FOR_PIPELINE = {
# TaskName: (pipeline_module_name, model_repo)
Tasks.word_segmentation:
('structbert-chinese-word-segmentation',
(Pipelines.word_segmentation,
'damo/nlp_structbert_word-segmentation_chinese-base'),
Tasks.sentence_similarity:
('sbert-base-chinese-sentence-similarity',
(Pipelines.sentence_similarity,
'damo/nlp_structbert_sentence-similarity_chinese-base'),
Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'),
Tasks.text_classification:
('bert-sentiment-analysis', 'damo/bert-base-sst2'),
Tasks.text_generation: ('palm2.0',
Tasks.nli: (Pipelines.nli, 'damo/nlp_structbert_nli_chinese-base'),
Tasks.sentiment_classification:
(Pipelines.sentiment_classification,
'damo/nlp_structbert_sentiment-classification_chinese-base'),
Tasks.text_classification: ('bert-sentiment-analysis',
'damo/bert-base-sst2'),
Tasks.image_matting: (Pipelines.image_matting,
'damo/cv_unet_image-matting'),
Tasks.text_classification: (Pipelines.sentiment_analysis,
'damo/bert-base-sst2'),
Tasks.text_generation: (Pipelines.text_generation,
'damo/nlp_palm2.0_text-generation_chinese-base'),
Tasks.image_captioning: ('ofa', None),
Tasks.image_captioning: (Pipelines.image_caption,
'damo/ofa_image-caption_coco_large_en'),
Tasks.image_generation:
('person-image-cartoon',
(Pipelines.person_image_cartoon,
'damo/cv_unet_person-image-cartoon_compound-models'),
Tasks.ocr_detection: (Pipelines.ocr_detection,
'damo/cv_resnet18_ocr-detection-line-level_damo'),
Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'),
Tasks.action_recognition: (Pipelines.action_recognition,
'damo/cv_TAdaConv_action-recognition'),
}


@@ -84,30 +100,40 @@ def pipeline(task: str = None,
if task is None and pipeline_name is None:
raise ValueError('task or pipeline_name is required')

assert isinstance(model, (type(None), str, Model, list)), \
f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}'

if pipeline_name is None:
# get default pipeline for this task
if isinstance(model, str) \
or (isinstance(model, list) and isinstance(model[0], str)):

# if is_model_name(model):
if (isinstance(model, str) and model.startswith('damo/')) \
or (isinstance(model, list) and model[0].startswith('damo/')) \
or (isinstance(model, str) and osp.exists(model)):
# TODO @wenmeng.zwm add support when model is a str of modelhub address
# read pipeline info from modelhub configuration file.
pipeline_name, default_model_repo = get_default_pipeline_info(
task)
if is_official_hub_path(model):
# read config file from hub and parse
cfg = read_config(model) if isinstance(
model, str) else read_config(model[0])
assert hasattr(
cfg,
'pipeline'), 'pipeline config is missing from config file.'
pipeline_name = cfg.pipeline.type
else:
# used for test case, when model is str and is not hub path
pipeline_name = get_pipeline_by_model_name(task, model)
elif isinstance(model, Model) or \
(isinstance(model, list) and isinstance(model[0], Model)):
# get pipeline info from Model object
first_model = model[0] if isinstance(model, list) else model
if not hasattr(first_model, 'pipeline'):
# model is instantiated by user, we should parse config again
cfg = read_config(first_model.model_dir)
assert hasattr(
cfg,
'pipeline'), 'pipeline config is missing from config file.'
first_model.pipeline = cfg.pipeline
pipeline_name = first_model.pipeline.type
else:
pipeline_name, default_model_repo = get_default_pipeline_info(task)

if model is None:
model = default_model_repo

assert isinstance(model, (type(None), str, Model, list)), \
f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}'

cfg = ConfigDict(type=pipeline_name, model=model)

if kwargs:
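
To illustrate the resolution order above, two hedged calls that should resolve
to the same pipeline class; the ids come from DEFAULT_MODEL_FOR_PIPELINE:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# explicit hub model: pipeline type is read from the model's configuration file
p1 = pipeline(
    task=Tasks.word_segmentation,
    model='damo/nlp_structbert_word-segmentation_chinese-base')
# no model given: falls back to the task default (pipeline_name, model_repo)
p2 = pipeline(task=Tasks.word_segmentation)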


+2 -0 modelscope/pipelines/cv/__init__.py

@@ -1,2 +1,4 @@
from .action_recognition_pipeline import ActionRecognitionPipeline
from .image_cartoon_pipeline import ImageCartoonPipeline
from .image_matting_pipeline import ImageMattingPipeline
from .ocr_detection_pipeline import OCRDetectionPipeline

+65 -0 modelscope/pipelines/cv/action_recognition_pipeline.py

@@ -0,0 +1,65 @@
import math
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.action_recognition.models import BaseVideoModel
from modelscope.pipelines.base import Input
from modelscope.preprocessors.video import ReadVideoData
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES

logger = get_logger()


@PIPELINES.register_module(
Tasks.action_recognition, module_name=Pipelines.action_recognition)
class ActionRecognitionPipeline(Pipeline):

def __init__(self, model: str):
super().__init__(model=model)
model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {model_path}')
config_path = osp.join(self.model, ModelFile.CONFIGURATION)
logger.info(f'loading config from {config_path}')
self.cfg = Config.from_file(config_path)
self.infer_model = BaseVideoModel(cfg=self.cfg).cuda()
self.infer_model.eval()
self.infer_model.load_state_dict(torch.load(model_path)['model_state'])
self.label_mapping = self.cfg.label_mapping
logger.info('load model done')

def preprocess(self, input: Input) -> Dict[str, Any]:
if isinstance(input, str):
video_input_data = ReadVideoData(self.cfg, input).cuda()
else:
raise TypeError(f'input should be a str,'
f' but got {type(input)}')
result = {'video_data': video_input_data}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
pred = self.perform_inference(input['video_data'])
output_label = self.label_mapping[str(pred)]
return {'output_label': output_label}

@torch.no_grad()
def perform_inference(self, data, max_bsz=4):
iter_num = math.ceil(data.size(0) / max_bsz)
preds_list = []
for i in range(iter_num):
preds_list.append(
self.infer_model(data[i * max_bsz:(i + 1) * max_bsz])[0])
pred = torch.cat(preds_list, dim=0)
return pred.mean(dim=0).argmax().item()

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs
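
A usage sketch for this pipeline (note the constructor requires CUDA; the
video path is a placeholder and the model id is the task default from
builder.py):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

recognizer = pipeline(
    task=Tasks.action_recognition,
    model='damo/cv_TAdaConv_action-recognition')
print(recognizer('path/to/video.mp4')['output_label'])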

+2 -1 modelscope/pipelines/cv/image_cartoon_pipeline.py

@@ -6,6 +6,7 @@ import numpy as np
import PIL
import tensorflow as tf

from modelscope.metainfo import Pipelines
from modelscope.models.cv.cartoon.facelib.facer import FaceAna
from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import (
get_reference_facial_points, warp_and_crop_face)
@@ -25,7 +26,7 @@ logger = get_logger()


@PIPELINES.register_module(
Tasks.image_generation, module_name='person-image-cartoon')
Tasks.image_generation, module_name=Pipelines.person_image_cartoon)
class ImageCartoonPipeline(Pipeline):

def __init__(self, model: str):


+2 -1 modelscope/pipelines/cv/image_matting_pipeline.py

@@ -5,6 +5,7 @@ import cv2
import numpy as np
import PIL

from modelscope.metainfo import Pipelines
from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import ModelFile, Tasks
@@ -16,7 +17,7 @@ logger = get_logger()


@PIPELINES.register_module(
Tasks.image_matting, module_name=Tasks.image_matting)
Tasks.image_matting, module_name=Pipelines.image_matting)
class ImageMattingPipeline(Pipeline):

def __init__(self, model: str):


+168 -0 modelscope/pipelines/cv/ocr_detection_pipeline.py

@@ -0,0 +1,168 @@
import math
import os
import os.path as osp
import sys
from typing import Any, Dict, List, Tuple, Union

import cv2
import numpy as np
import PIL
import tensorflow as tf
import tf_slim as slim

from modelscope.metainfo import Pipelines
from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1
tf.compat.v1.disable_eager_execution()

logger = get_logger()

# constant
RBOX_DIM = 5
OFFSET_DIM = 6
WORD_POLYGON_DIM = 8
OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_float('node_threshold', 0.4,
'Confidence threshold for nodes')
tf.app.flags.DEFINE_float('link_threshold', 0.6,
'Confidence threshold for links')


@PIPELINES.register_module(
Tasks.ocr_detection, module_name=Pipelines.ocr_detection)
class OCRDetectionPipeline(Pipeline):

def __init__(self, model: str):
super().__init__(model=model)
model_path = osp.join(
osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
'checkpoint-80000')

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
self._session = tf.Session(config=config)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
dtype=tf.int64,
trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(
0.997, global_step)
self.input_images = tf.placeholder(
tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
self.output = {}

# detector
detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector()
all_maps = detector.build_model(self.input_images, is_training=False)

# decode local predictions
all_nodes, all_links, all_reg = [], [], []
for i, maps in enumerate(all_maps):
cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2]
reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)

cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2]))

lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2])
lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:])
lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1)

all_nodes.append(cls_prob)
all_links.append(lnk_prob)
all_reg.append(reg_maps)

# decode segments and links
image_size = tf.shape(self.input_images)[1:3]
segments, group_indices, segment_counts, _ = ops.decode_segments_links_python(
image_size,
all_nodes,
all_links,
all_reg,
anchor_sizes=list(detector.anchor_sizes))

# combine segments
combined_rboxes, combined_counts = ops.combine_segments_python(
segments, group_indices, segment_counts)
self.output['combined_rboxes'] = combined_rboxes
self.output['combined_counts'] = combined_counts

with self._session.as_default() as sess:
logger.info(f'loading model from {model_path}')
# load model
model_loader = tf.train.Saver(
variable_averages.variables_to_restore())
model_loader.restore(sess, model_path)

def preprocess(self, input: Input) -> Dict[str, Any]:
if isinstance(input, str):
img = np.array(load_image(input))
elif isinstance(input, PIL.Image.Image):
img = np.array(input.convert('RGB'))
elif isinstance(input, np.ndarray):
if len(input.shape) == 2:
    input = cv2.cvtColor(input, cv2.COLOR_GRAY2BGR)
img = input[:, :, ::-1]  # to rgb order
else:
    raise TypeError(f'input should be either str, PIL.Image,'
                    f' np.ndarray, but got {type(input)}')
h, w, c = img.shape
img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
img_pad[:h, :w, :] = img

resize_size = 1024
img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
dtype=np.float32)

resize_size = tf.stack([resize_size, resize_size])
orig_size = tf.stack([max(h, w), max(h, w)])
self.output['orig_size'] = orig_size
self.output['resize_size'] = resize_size

result = {'img': np.expand_dims(img_pad_resize, axis=0)}
return result

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
with self._session.as_default():
feed_dict = {self.input_images: input['img']}
sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
return sess_outputs

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
rboxes = inputs['combined_rboxes'][0]
count = inputs['combined_counts'][0]
rboxes = rboxes[:count, :]

# convert rboxes to polygons and find its coordinates on the original image
orig_h, orig_w = inputs['orig_size']
resize_h, resize_w = inputs['resize_size']
polygons = utils.rboxes_to_polygons(rboxes)
scale_y = float(orig_h) / float(resize_h)
scale_x = float(orig_w) / float(resize_w)

# confine polygons inside image
polygons[:, ::2] = np.maximum(
0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
polygons[:, 1::2] = np.maximum(
0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
polygons = np.round(polygons).astype(np.int32)

# nms
dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()]
dt_nms = utils.nms_python(dt_n9)
dt_polygons = np.array([o[:8] for o in dt_nms])

result = {'det_polygons': dt_polygons}
return result
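
And a matching sketch for OCR detection (model id is the task default; the
image path is a placeholder):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

detector = pipeline(
    task=Tasks.ocr_detection,
    model='damo/cv_resnet18_ocr-detection-line-level_damo')
polygons = detector('path/to/image.jpg')['det_polygons']  # (N, 8) corners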

+0 -0 modelscope/pipelines/cv/ocr_utils/__init__.py


+158 -0 modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py

@@ -0,0 +1,158 @@
import tensorflow as tf
import tf_slim as slim

from . import ops, resnet18_v1, resnet_utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1

# constants
OFFSET_DIM = 6

N_LOCAL_LINKS = 8
N_CROSS_LINKS = 4
N_SEG_CLASSES = 2
N_LNK_CLASSES = 4

POS_LABEL = 1
NEG_LABEL = 0


class SegLinkDetector():

def __init__(self):
self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.]

def _detection_classifier(self,
maps,
ksize,
weight_decay,
cross_links=False,
scope=None):

with tf.variable_scope(scope):
seg_depth = N_SEG_CLASSES
if cross_links:
lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS)
else:
lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS
reg_depth = OFFSET_DIM
map_depth = maps.get_shape()[3]
inter_maps, inter_relu = ops.conv2d(
maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter')

dir_maps, dir_relu = ops.conv2d(
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir')
cen_maps, cen_relu = ops.conv2d(
inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen')
pol_maps, pol_relu = ops.conv2d(
inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol')
concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1)
_, lnk_embedding = ops.conv_relu(
concat_relu, 12, 256, 1, 1, scope='lnk_embedding')
lnk_maps, lnk_relu = ops.conv2d(
inter_relu + lnk_embedding,
256,
lnk_depth,
ksize,
1,
'SAME',
scope='conv_lnk')

char_seg_maps, char_seg_relu = ops.conv2d(
inter_relu,
256,
seg_depth,
ksize,
1,
'SAME',
scope='conv_char_cls')
char_reg_maps, char_reg_relu = ops.conv2d(
inter_relu,
256,
reg_depth,
ksize,
1,
'SAME',
scope='conv_char_reg')
concat_char_relu = tf.concat([char_seg_relu, char_reg_relu],
axis=-1)
_, char_embedding = ops.conv_relu(
concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding')
seg_maps, seg_relu = ops.conv2d(
inter_relu + char_embedding,
256,
seg_depth,
ksize,
1,
'SAME',
scope='conv_cls')
reg_maps, reg_relu = ops.conv2d(
inter_relu + char_embedding,
256,
reg_depth,
ksize,
1,
'SAME',
scope='conv_reg')

return seg_relu, lnk_relu, reg_relu

def _build_cnn(self, images, weight_decay, is_training):
with slim.arg_scope(
resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)):
logits, end_points = resnet18_v1.resnet_v1_18(
images, is_training=is_training, scope='resnet_v1_18')

outputs = {
'conv3_3': end_points['pool1'],
'conv4_3': end_points['pool2'],
'fc7': end_points['pool3'],
'conv8_2': end_points['pool4'],
'conv9_2': end_points['pool5'],
'conv10_2': end_points['pool6'],
}
return outputs

def build_model(self, images, is_training=True, scope=None):

weight_decay = 5e-4 # FLAGS.weight_decay
cnn_outputs = self._build_cnn(images, weight_decay, is_training)
det_0 = self._detection_classifier(
cnn_outputs['conv3_3'],
3,
weight_decay,
cross_links=False,
scope='dete_0')
det_1 = self._detection_classifier(
cnn_outputs['conv4_3'],
3,
weight_decay,
cross_links=True,
scope='dete_1')
det_2 = self._detection_classifier(
cnn_outputs['fc7'],
3,
weight_decay,
cross_links=True,
scope='dete_2')
det_3 = self._detection_classifier(
cnn_outputs['conv8_2'],
3,
weight_decay,
cross_links=True,
scope='dete_3')
det_4 = self._detection_classifier(
cnn_outputs['conv9_2'],
3,
weight_decay,
cross_links=True,
scope='dete_4')
det_5 = self._detection_classifier(
cnn_outputs['conv10_2'],
3,
weight_decay,
cross_links=True,
scope='dete_5')
outputs = [det_0, det_1, det_2, det_3, det_4, det_5]
return outputs

+1098 -0 modelscope/pipelines/cv/ocr_utils/ops.py (diff suppressed: file too large)


+432 -0 modelscope/pipelines/cv/ocr_utils/resnet18_v1.py

@@ -0,0 +1,432 @@
"""Contains definitions for the original form of Residual Networks.
The 'v1' residual networks (ResNets) implemented in this module were proposed
by:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
Other variants were introduced in:
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
The networks defined in this module utilize the bottleneck building block of
[1] with projection shortcuts only for increasing depths. They employ batch
normalization *after* every weight layer. This is the architecture used by
MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and
ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1'
architecture and the alternative 'v2' architecture of [2] which uses batch
normalization *before* every weight layer in the so-called full pre-activation
units.
Typical use:
from tensorflow.contrib.slim.nets import resnet_v1
ResNet-101 for image classification into 1000 classes:
# inputs has shape [batch, 224, 224, 3]
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)
ResNet-101 for semantic segmentation into 21 classes:
# inputs has shape [batch, 513, 513, 3]
with slim.arg_scope(resnet_v1.resnet_arg_scope()):
net, end_points = resnet_v1.resnet_v1_101(inputs,
21,
is_training=False,
global_pool=False,
output_stride=16)
"""
import tensorflow as tf
import tf_slim as slim

from . import resnet_utils

if tf.__version__ >= '2.0':
tf = tf.compat.v1

resnet_arg_scope = resnet_utils.resnet_arg_scope


@slim.add_arg_scope
def basicblock(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
its definition. Note that we use here the bottleneck variant which has an
extra bottleneck layer.
When putting together two consecutive ResNet blocks that use this unit, one
should use stride = 2 in the last unit of the first block.
Args:
inputs: A tensor of size [batch, height, width, channels].
depth: The depth of the ResNet unit output.
depth_bottleneck: The depth of the bottleneck layers.
stride: The ResNet unit's stride. Determines the amount of downsampling of
the units output compared to its input.
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
Returns:
The ResNet unit's output.
"""
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=None,
scope='shortcut')

residual = resnet_utils.conv2d_same(
inputs, depth, 3, stride, rate=rate, scope='conv1')
residual = resnet_utils.conv2d_same(
residual, depth, 3, 1, rate=rate, scope='conv2')

output = tf.nn.relu(residual + shortcut)

return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope, output)


@slim.add_arg_scope
def bottleneck(inputs,
depth,
depth_bottleneck,
stride,
rate=1,
outputs_collections=None,
scope=None):
"""Bottleneck residual unit variant with BN after convolutions.
This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
its definition. Note that we use here the bottleneck variant which has an
extra bottleneck layer.
When putting together two consecutive ResNet blocks that use this unit, one
should use stride = 2 in the last unit of the first block.
Args:
inputs: A tensor of size [batch, height, width, channels].
depth: The depth of the ResNet unit output.
depth_bottleneck: The depth of the bottleneck layers.
stride: The ResNet unit's stride. Determines the amount of downsampling of
the units output compared to its input.
rate: An integer, rate for atrous convolution.
outputs_collections: Collection to add the ResNet unit output.
scope: Optional variable_scope.
Returns:
The ResNet unit's output.
"""
with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
if depth == depth_in:
shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
else:
shortcut = slim.conv2d(
inputs,
depth, [1, 1],
stride=stride,
activation_fn=None,
scope='shortcut')

residual = slim.conv2d(
inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1')
residual = resnet_utils.conv2d_same(
residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
residual = slim.conv2d(
residual,
depth, [1, 1],
stride=1,
activation_fn=None,
scope='conv3')

output = tf.nn.relu(shortcut + residual)

return slim.utils.collect_named_outputs(outputs_collections,
sc.original_name_scope, output)


def resnet_v1(inputs,
blocks,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
include_root_block=True,
spatial_squeeze=True,
reuse=None,
scope=None):
"""Generator for v1 ResNet models.
This function generates a family of ResNet v1 models. See the resnet_v1_*()
methods for specific model instantiations, obtained by selecting different
block instantiations that produce ResNets of various depths.
Training for image classification on Imagenet is usually done with [224, 224]
inputs, resulting in [7, 7] feature maps at the output of the last ResNet
block for the ResNets defined in [1] that have nominal stride equal to 32.
However, for dense prediction tasks we advise that one uses inputs with
spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
this case the feature maps at the ResNet output will have spatial shape
[(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
and corners exactly aligned with the input image corners, which greatly
facilitates alignment of the features to the image. Using as input [225, 225]
images results in [8, 8] feature maps at the output of the last ResNet block.
For dense prediction tasks, the ResNet needs to run in fully-convolutional
(FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
have nominal stride equal to 32 and a good choice in FCN mode is to use
output_stride=16 in order to increase the density of the computed features at
small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.
Args:
inputs: A tensor of size [batch, height_in, width_in, channels].
blocks: A list of length equal to the number of ResNet blocks. Each element
is a resnet_utils.Block object describing the units in the block.
num_classes: Number of predicted classes for classification tasks. If None
we return the features before the logit layer.
is_training: whether is training or not.
global_pool: If True, we perform global average pooling before computing the
logits. Set to True for image classification, False for dense prediction.
output_stride: If None, then the output will be computed at the nominal
network stride. If output_stride is not None, it specifies the requested
ratio of input to output spatial resolution.
include_root_block: If True, include the initial convolution followed by
max-pooling, if False excludes it.
spatial_squeeze: if True, logits is of shape [B, C], if False logits is
of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
reuse: whether or not the network and its variables should be reused. To be
able to reuse 'scope' must be given.
scope: Optional variable_scope.
Returns:
net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
If global_pool is False, then height_out and width_out are reduced by a
factor of output_stride compared to the respective height_in and width_in,
else both height_out and width_out equal one. If num_classes is None, then
net is the output of the last ResNet block, potentially after global
average pooling. If num_classes is not None, net contains the pre-softmax
activations.
end_points: A dictionary from components of the network to the corresponding
activation.
Raises:
ValueError: If the target output_stride is not valid.
"""
with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
end_points_collection = sc.name + '_end_points'
with slim.arg_scope(
[slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
outputs_collections=end_points_collection):
with slim.arg_scope([slim.batch_norm], is_training=is_training):
net = inputs
if include_root_block:
if output_stride is not None:
if output_stride % 4 != 0:
raise ValueError(
'The output_stride needs to be a multiple of 4.'
)
output_stride //= 4  # integer division keeps the stride checks exact
net = resnet_utils.conv2d_same(
net, 64, 7, stride=2, scope='conv1')
net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

net = slim.utils.collect_named_outputs(
end_points_collection, 'pool2', net)

net = resnet_utils.stack_blocks_dense(net, blocks,
output_stride)

end_points = slim.utils.convert_collection_to_dict(
end_points_collection)

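# NOTE: these pool aliases hard-code the unit names of the 7-block
# resnet_v1_18 defined below; the deeper variants in this file would
# raise a KeyError here unless given matching block/unit names.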
end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2']
end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2']
end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2']
end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2']
end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2']
end_points['pool6'] = net

return net, end_points


resnet_v1.default_image_size = 224


def resnet_v1_18(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_18'):
"""ResNet-18 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', basicblock,
[(64, 64, 1)] + [(64, 64, 1)]),
resnet_utils.Block('block2', basicblock,
[(128, 128, 1)] + [(128, 128, 1)]),
resnet_utils.Block('block3', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block4', basicblock,
[(512, 512, 2)] + [(512, 512, 1)]),
resnet_utils.Block('block5', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block6', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
resnet_utils.Block('block7', basicblock,
[(256, 256, 2)] + [(256, 256, 1)]),
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_18.default_image_size = resnet_v1.default_image_size
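# A minimal sketch of the dense-feature usage described in the
# resnet_v1() docstring: with output_stride=16, stack_blocks_dense
# replaces later stride-2 units by stride-1 atrous units so the returned
# feature maps keep the requested density. Note that in this trimmed OCR
# variant, classification arguments such as num_classes, global_pool and
# spatial_squeeze are accepted for API compatibility but not used by the
# network body.
def _resnet_v1_18_dense_features_example():
    images = tf.placeholder(tf.float32, shape=(None, 321, 321, 3))
    with slim.arg_scope(resnet_arg_scope()):
        net, end_points = resnet_v1_18(images, output_stride=16)
    return net, end_points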


def resnet_v1_50(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_50'):
"""ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 3 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 5 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck,
[(2048, 512, 1)] * 3 + [(2048, 512, 2)]),
resnet_utils.Block('block5', bottleneck,
[(1024, 256, 1)] * 2 + [(1024, 256, 2)]),
resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2),
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_50.default_image_size = resnet_v1.default_image_size


def resnet_v1_101(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_101'):
"""ResNet-101 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 3 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 22 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_101.default_image_size = resnet_v1.default_image_size


def resnet_v1_152(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_152'):
"""ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 7 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_152.default_image_size = resnet_v1.default_image_size


def resnet_v1_200(inputs,
num_classes=None,
is_training=True,
global_pool=True,
output_stride=None,
spatial_squeeze=True,
reuse=None,
scope='resnet_v1_200'):
"""ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
blocks = [
resnet_utils.Block('block1', bottleneck,
[(256, 64, 1)] * 2 + [(256, 64, 2)]),
resnet_utils.Block('block2', bottleneck,
[(512, 128, 1)] * 23 + [(512, 128, 2)]),
resnet_utils.Block('block3', bottleneck,
[(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3)
]
return resnet_v1(
inputs,
blocks,
num_classes,
is_training,
global_pool=global_pool,
output_stride=output_stride,
include_root_block=True,
spatial_squeeze=spatial_squeeze,
reuse=reuse,
scope=scope)


resnet_v1_200.default_image_size = resnet_v1.default_image_size

if __name__ == '__main__':
    inputs = tf.placeholder(
        tf.float32, shape=(None, 224, 224, 3), name='input')
    with slim.arg_scope(resnet_arg_scope()):
        # resnet_v1() aliases its pool end_points to 'resnet_v1_18/...'
        # unit names and returns a (net, end_points) tuple, so the
        # 18-layer variant is the one this demo runs end to end.
        net, end_points = resnet_v1_18(inputs)

+ 231
- 0
modelscope/pipelines/cv/ocr_utils/resnet_utils.py

@@ -0,0 +1,231 @@
"""Contains building blocks for various versions of Residual Networks.
Residual networks (ResNets) were proposed in:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015
More variants were introduced in:
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016
We can obtain different ResNet variants by changing the network depth, width,
and form of residual unit. This module implements the infrastructure for
building them. Concrete ResNet units and full ResNet networks are implemented in
the accompanying resnet_v1.py and resnet_v2.py modules.
Compared to https://github.com/KaimingHe/deep-residual-networks, in the current
implementation we subsample the output activations in the last residual unit of
each block, instead of subsampling the input activations in the first residual
unit of each block. The two implementations give identical results but our
implementation is more memory efficient.
"""

import collections

import tensorflow as tf
import tf_slim as slim

if tf.__version__ >= '2.0':
tf = tf.compat.v1


class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])):
"""A named tuple describing a ResNet block.
Its parts are:
scope: The scope of the `Block`.
unit_fn: The ResNet unit function which takes as input a `Tensor` and
returns another `Tensor` with the output of the ResNet unit.
args: A list of length equal to the number of units in the `Block`. The list
contains one (depth, depth_bottleneck, stride) tuple for each unit in the
block to serve as argument to unit_fn.
"""


def subsample(inputs, factor, scope=None):
"""Subsamples the input along the spatial dimensions.
Args:
inputs: A `Tensor` of size [batch, height_in, width_in, channels].
factor: The subsampling factor.
scope: Optional variable_scope.
Returns:
output: A `Tensor` of size [batch, height_out, width_out, channels] with the
input, either intact (if factor == 1) or subsampled (if factor > 1).
"""
if factor == 1:
return inputs
else:
return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope)


def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None):
"""Strided 2-D convolution with 'SAME' padding.
When stride > 1, then we do explicit zero-padding, followed by conv2d with
'VALID' padding.
Note that
net = conv2d_same(inputs, num_outputs, 3, stride=stride)
is equivalent to
net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME')
net = subsample(net, factor=stride)
whereas
net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME')
is different when the input's height or width is even, which is why we add the
current function. For more details, see ResnetUtilsTest.testConv2DSameEven().
Args:
inputs: A 4-D tensor of size [batch, height_in, width_in, channels].
num_outputs: An integer, the number of output filters.
kernel_size: An int with the kernel_size of the filters.
stride: An integer, the output stride.
rate: An integer, rate for atrous convolution.
scope: Scope.
Returns:
output: A 4-D tensor of size [batch, height_out, width_out, channels] with
the convolution output.
"""
if stride == 1:
return slim.conv2d(
inputs,
num_outputs,
kernel_size,
stride=1,
rate=rate,
padding='SAME',
scope=scope)
else:
kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1)
pad_total = kernel_size_effective - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
inputs = tf.pad(
inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
return slim.conv2d(
inputs,
num_outputs,
kernel_size,
stride=stride,
rate=rate,
padding='VALID',
scope=scope)
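# Worked example of the padding arithmetic above: for kernel_size=3 and
# rate=2 the effective kernel is 3 + (3 - 1) * (2 - 1) = 5, so pad_total
# is 4 and the input gains 2 rows/columns on each side before the VALID
# convolution, reproducing 'SAME' output sizes for any input parity.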


@slim.add_arg_scope
def stack_blocks_dense(net,
blocks,
output_stride=None,
outputs_collections=None):
"""Stacks ResNet `Blocks` and controls output feature density.
First, this function creates scopes for the ResNet in the form of
'block_name/unit_1', 'block_name/unit_2', etc.
Second, this function allows the user to explicitly control the ResNet
output_stride, which is the ratio of the input to output spatial resolution.
This is useful for dense prediction tasks such as semantic segmentation or
object detection.
Most ResNets consist of 4 ResNet blocks and subsample the activations by a
factor of 2 when transitioning between consecutive ResNet blocks. This results
in a nominal ResNet output_stride equal to 8. If we set the output_stride to
half the nominal network stride (e.g., output_stride=4), then we compute
responses twice.
Control of the output feature density is implemented by atrous convolution.
Args:
net: A `Tensor` of size [batch, height, width, channels].
blocks: A list of length equal to the number of ResNet `Blocks`. Each
element is a ResNet `Block` object describing the units in the `Block`.
output_stride: If `None`, then the output will be computed at the nominal
network stride. If output_stride is not `None`, it specifies the requested
ratio of input to output spatial resolution, which needs to be equal to
the product of unit strides from the start up to some level of the ResNet.
For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1,
then valid values for the output_stride are 1, 2, 6, 24 or None (which
is equivalent to output_stride=24).
outputs_collections: Collection to add the ResNet block outputs.
Returns:
net: Output tensor with stride equal to the specified output_stride.
Raises:
ValueError: If the target output_stride is not valid.
"""
# The current_stride variable keeps track of the effective stride of the
# activations. This allows us to invoke atrous convolution whenever applying
# the next residual unit would result in the activations having stride larger
# than the target output_stride.
current_stride = 1

# The atrous convolution rate parameter.
rate = 1

for block in blocks:
with tf.variable_scope(block.scope, 'block', [net]):
for i, unit in enumerate(block.args):
if output_stride is not None and current_stride > output_stride:
raise ValueError(
'The target output_stride cannot be reached.')

with tf.variable_scope(
'unit_%d' % (i + 1), values=[net]) as sc:
unit_depth, unit_depth_bottleneck, unit_stride = unit
# If we have reached the target output_stride, then we need to employ
# atrous convolution with stride=1 and multiply the atrous rate by the
# current unit's stride for use in subsequent layers.
if output_stride is not None and current_stride == output_stride:
net = block.unit_fn(
net,
depth=unit_depth,
depth_bottleneck=unit_depth_bottleneck,
stride=1,
rate=rate)
rate *= unit_stride

else:
net = block.unit_fn(
net,
depth=unit_depth,
depth_bottleneck=unit_depth_bottleneck,
stride=unit_stride,
rate=1)
current_stride *= unit_stride
net = slim.utils.collect_named_outputs(
outputs_collections, sc.name, net)

if output_stride is not None and current_stride != output_stride:
raise ValueError('The target output_stride cannot be reached.')

return net
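# Illustrative trace: with unit strides (1, 2, 2, 2, 2) and
# output_stride=4, the first two stride-2 units run normally and
# current_stride reaches 4; each later stride-2 unit is then executed
# with stride 1, and the atrous rate is multiplied by the skipped stride
# afterwards (rate 1, then 2, ...), so the activations never become
# coarser than the requested stride.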


def resnet_arg_scope(weight_decay=0.0001,
batch_norm_decay=0.997,
batch_norm_epsilon=1e-5,
batch_norm_scale=True):
"""Defines the default ResNet arg scope.
TODO(gpapan): The batch-normalization related default values above are
appropriate for use in conjunction with the reference ResNet models
released at https://github.com/KaimingHe/deep-residual-networks. When
training ResNets from scratch, they might need to be tuned.
Args:
weight_decay: The weight decay to use for regularizing the model.
batch_norm_decay: The moving average decay when estimating layer activation
statistics in batch normalization.
batch_norm_epsilon: Small constant to prevent division by zero when
normalizing activations by their variance in batch normalization.
batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the
activations in the batch normalization layer.
Returns:
An `arg_scope` to use for the resnet models.
"""
batch_norm_params = {
'decay': batch_norm_decay,
'epsilon': batch_norm_epsilon,
'scale': batch_norm_scale,
'updates_collections': tf.GraphKeys.UPDATE_OPS,
}

with slim.arg_scope(
[slim.conv2d],
weights_regularizer=slim.l2_regularizer(weight_decay),
weights_initializer=slim.variance_scaling_initializer(),
activation_fn=tf.nn.relu,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params):
with slim.arg_scope([slim.batch_norm], **batch_norm_params):
# Unlike the reference slim implementation, pool1 here uses
# padding='VALID', as in the accompanying code of 'Deep Residual
# Learning for Image Recognition'. Switching to padding='SAME'
# (as used in https://github.com/facebook/fb.resnet.torch) makes
# feature alignment easier for dense prediction tasks.
with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
return arg_sc

+ 108
- 0
modelscope/pipelines/cv/ocr_utils/utils.py

@@ -0,0 +1,108 @@
import cv2
import numpy as np


def rboxes_to_polygons(rboxes):
"""
Convert rboxes to polygons
ARGS
`rboxes`: [n, 5]
RETURN
`polygons`: [n, 8]
"""

theta = rboxes[:, 4:5]
cxcy = rboxes[:, :2]
half_w = rboxes[:, 2:3] / 2.
half_h = rboxes[:, 3:4] / 2.
v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w])
v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h])
p1 = cxcy - v1 - v2
p2 = cxcy + v1 - v2
p3 = cxcy + v1 + v2
p4 = cxcy - v1 + v2
polygons = np.hstack([p1, p2, p3, p4])
return polygons
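# Sanity check of the layout above: an axis-aligned 2 x 2 rbox centred
# at the origin maps to its four corners:
#   rboxes_to_polygons(np.array([[0., 0., 2., 2., 0.]]))
#   -> array([[-1., -1., 1., -1., 1., 1., -1., 1.]])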


def cal_width(box):
pd1 = point_dist(box[0], box[1], box[2], box[3])
pd2 = point_dist(box[4], box[5], box[6], box[7])
return (pd1 + pd2) / 2


def point_dist(x1, y1, x2, y2):
return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1))


def draw_polygons(img, polygons):
for p in polygons.tolist():
p = [int(o) for o in p]
cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1)
cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1)
cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1)
cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1)
return img


def nms_python(boxes):
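    """Greedy NMS over scored polygons.

    Each box is [x1, y1, x2, y2, x3, y3, x4, y4, score]; when the centre
    of one box falls inside the rotated rect of another (or vice versa),
    the lower-scoring box is suppressed.
    """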
boxes = sorted(boxes, key=lambda x: -x[8])
nms_flag = [True] * len(boxes)
for i, a in enumerate(boxes):
if not nms_flag[i]:
continue
else:
for j, b in enumerate(boxes):
if not j > i:
continue
if not nms_flag[j]:
continue
score_a = a[8]
score_b = b[8]
rbox_a = polygon2rbox(a[:8])
rbox_b = polygon2rbox(b[:8])
if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox(
rbox_b[:2], rbox_a):
if score_a > score_b:
nms_flag[j] = False
boxes_nms = []
for i, box in enumerate(boxes):
if nms_flag[i]:
boxes_nms.append(box)
return boxes_nms


def point_in_rbox(c, rbox):
cx0, cy0 = c[0], c[1]
cx1, cy1 = rbox[0], rbox[1]
w, h = rbox[2], rbox[3]
theta = rbox[4]
dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta))
dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta))
return ((dist_x < w / 2.0) and (dist_y < h / 2.0))


def polygon2rbox(polygon):
x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6]
y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7]
c_x = (x1 + x2 + x3 + x4) / 4
c_y = (y1 + y2 + y3 + y4) / 4
w1 = point_dist(x1, y1, x2, y2)
w2 = point_dist(x3, y3, x4, y4)
h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2)
h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4)
h = h1 + h2
w = (w1 + w2) / 2
theta1 = np.arctan2(y2 - y1, x2 - x1)
theta2 = np.arctan2(y3 - y4, x3 - x4)
theta = (theta1 + theta2) / 2.0
return [c_x, c_y, w, h, theta]


def point_line_dist(px, py, x1, y1, x2, y2):
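    """Distance from the point (px, py) to the infinite line through
    (x1, y1) and (x2, y2), computed via the cross-product form; eps
    guards against zero-length segments."""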
eps = 1e-6
dx = x2 - x1
dy = y2 - y1
div = np.sqrt(dx * dx + dy * dy) + eps
dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div
return dist

+ 1
- 1
modelscope/pipelines/multi_modal/__init__.py

@@ -1 +1 @@
from .image_caption_pipeline import ImageCaptionPipeline
from .image_captioning_pipeline import ImageCaptionPipeline

+ 35
- 0
modelscope/pipelines/multi_modal/image_captioning_pipeline.py

@@ -0,0 +1,35 @@
from typing import Any, Dict, Optional, Union

from modelscope.metainfo import Pipelines
from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from ..base import Model, Pipeline
from ..builder import PIPELINES

logger = get_logger()


@PIPELINES.register_module(
Tasks.image_captioning, module_name=Pipelines.image_caption)
class ImageCaptionPipeline(Pipeline):

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: Optional[Preprocessor] = None,
                 **kwargs):
        assert isinstance(model, (str, Model)), \
            'model must be a model id str or a Model instance'
        if isinstance(model, str):
            pipe_model = Model.from_pretrained(model)
        else:
            pipe_model = model
        if preprocessor is None:
            # the preprocessor needs the local model assets; models
            # loaded via from_pretrained expose their directory as
            # model_dir, as the other pipelines in this package rely on
            preprocessor = OfaImageCaptionPreprocessor(
                model_dir=pipe_model.model_dir)
        super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs)

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
return inputs

+ 6
- 3
modelscope/pipelines/nlp/__init__.py

@@ -1,7 +1,10 @@
from .dialog_intent_prediction_pipeline import * # noqa F403
from .dialog_modeling_pipeline import * # noqa F403
from .dialog_state_tracking import * # noqa F403
from .fill_mask_pipeline import * # noqa F403
from .nli_pipeline import * # noqa F403
from .sentence_similarity_pipeline import * # noqa F403
from .sentiment_classification_pipeline import * # noqa F403
from .sequence_classification_pipeline import * # noqa F403
from .space.dialog_intent_prediction_pipeline import * # noqa F403
from .space.dialog_modeling_pipeline import * # noqa F403
from .space.dialog_state_tracking import * # noqa F403
from .text_generation_pipeline import * # noqa F403
from .word_segmentation_pipeline import * # noqa F403

modelscope/pipelines/nlp/space/dialog_intent_prediction_pipeline.py → modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py

@@ -1,16 +1,18 @@
from typing import Any, Dict, Optional
from typing import Any, Dict

from modelscope.models.nlp import DialogIntentModel
from modelscope.preprocessors import DialogIntentPredictionPreprocessor
from modelscope.utils.constant import Tasks
from ...base import Input, Pipeline
from ...builder import PIPELINES
from ...metainfo import Pipelines
from ...models.nlp import DialogIntentModel
from ...preprocessors import DialogIntentPredictionPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline
from ..builder import PIPELINES

__all__ = ['DialogIntentPredictionPipeline']


@PIPELINES.register_module(
Tasks.dialog_intent_prediction, module_name=r'space-intent')
Tasks.dialog_intent_prediction,
module_name=Pipelines.dialog_intent_prediction)
class DialogIntentPredictionPipeline(Pipeline):

def __init__(self, model: DialogIntentModel,

modelscope/pipelines/nlp/space/dialog_modeling_pipeline.py → modelscope/pipelines/nlp/dialog_modeling_pipeline.py

@@ -3,14 +3,15 @@ from typing import Any, Dict, Optional
from modelscope.models.nlp import DialogModelingModel
from modelscope.preprocessors import DialogModelingPreprocessor
from modelscope.utils.constant import Tasks
from ...base import Pipeline, Tensor
from ...builder import PIPELINES
from ...metainfo import Pipelines
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['DialogModelingPipeline']


@PIPELINES.register_module(
Tasks.dialog_modeling, module_name=r'space-modeling')
Tasks.dialog_modeling, module_name=Pipelines.dialog_modeling)
class DialogModelingPipeline(Pipeline):

def __init__(self, model: DialogModelingModel,

+ 45
- 0
modelscope/pipelines/nlp/dialog_state_tracking.py

@@ -0,0 +1,45 @@
from typing import Any, Dict

from ...metainfo import Pipelines
from ...models.nlp import DialogStateTrackingModel
from ...preprocessors import DialogStateTrackingPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline
from ..builder import PIPELINES

__all__ = ['DialogStateTrackingPipeline']


@PIPELINES.register_module(
Tasks.dialog_state_tracking, module_name=Pipelines.dialog_state_tracking)
class DialogStateTrackingPipeline(Pipeline):

def __init__(self, model: DialogStateTrackingModel,
preprocessor: DialogStateTrackingPreprocessor, **kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""

super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.model = model
# self.tokenizer = preprocessor.tokenizer

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model prediction outputs

Returns:
Dict[str, str]: the prediction results
"""
import numpy as np
pred = inputs['pred']
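# np.where over the equality mask returns the indices of every class
# attaining the maximum score; pos[0] is the predicted label id(s)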
pos = np.where(pred == np.max(pred))

result = {'pred': pred, 'label': pos[0]}

return result

+ 107
- 0
modelscope/pipelines/nlp/fill_mask_pipeline.py

@@ -0,0 +1,107 @@
from typing import Any, Dict, Optional, Union

import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp.masked_language_model import MaskedLanguageModelBase
from ...preprocessors import FillMaskPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['FillMaskPipeline']


@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask)
class FillMaskPipeline(Pipeline):

def __init__(self,
model: Union[MaskedLanguageModelBase, str],
preprocessor: Optional[FillMaskPreprocessor] = None,
first_sequence='sentence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction

Args:
model (MaskedLanguageModelBase): a model instance
preprocessor (FillMaskPreprocessor): a preprocessor instance
"""
fill_mask_model = model if isinstance(
model, MaskedLanguageModelBase) else Model.from_pretrained(model)
assert fill_mask_model.config is not None

if preprocessor is None:
preprocessor = FillMaskPreprocessor(
fill_mask_model.model_dir,
first_sequence=first_sequence,
second_sequence=None)
fill_mask_model.eval()
super().__init__(
model=fill_mask_model, preprocessor=preprocessor, **kwargs)

self.preprocessor = preprocessor
self.tokenizer = preprocessor.tokenizer
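# mask-token ids per supported backbone: 250001 is <mask> in the
# VECO (XLM-R style) vocabulary, 103 is [MASK] in the Chinese BERT
# vocabulary used by sbert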
self.mask_id = {'veco': 250001, 'sbert': 103}

self.rep_map = {
'sbert': {
'[unused0]': '',
'[PAD]': '',
'[unused1]': '',
r' +': ' ',
'[SEP]': '',
'[unused2]': '',
'[CLS]': '',
'[UNK]': ''
},
'veco': {
r' +': ' ',
'<mask>': '<q>',
'<pad>': '',
'<s>': '',
'</s>': '',
'<unk>': ' '
}
}

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model prediction outputs

Returns:
Dict[str, str]: the prediction results
"""
import numpy as np
logits = inputs['logits'].detach().numpy()
input_ids = inputs['input_ids'].detach().numpy()
pred_ids = np.argmax(logits, axis=-1)
model_type = self.model.config.model_type
rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids,
input_ids)

def rep_tokens(string, rep_map):
for k, v in rep_map.items():
string = string.replace(k, v)
return string.strip()

pred_strings = []
for ids in rst_ids: # batch
# TODO vocab size is not stable

if self.model.config.vocab_size == 21128: # zh bert
pred_string = self.tokenizer.convert_ids_to_tokens(ids)
pred_string = ''.join(pred_string)
else:
pred_string = self.tokenizer.decode(ids)
pred_string = rep_tokens(pred_string, self.rep_map[model_type])
pred_strings.append(pred_string)

return {'text': pred_strings}

+ 72
- 0
modelscope/pipelines/nlp/nli_pipeline.py

@@ -0,0 +1,72 @@
from typing import Any, Dict, Union

import numpy as np
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForNLI
from ...preprocessors import NLIPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline
from ..builder import PIPELINES

__all__ = ['NLIPipeline']


@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli)
class NLIPipeline(Pipeline):

def __init__(self,
model: Union[SbertForNLI, str],
preprocessor: NLIPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SbertForNLI): a model instance
preprocessor (NLIPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForNLI), \
'model must be a single str or SbertForNLI'
model = model if isinstance(
model, SbertForNLI) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = NLIPreprocessor(
model.model_dir,
first_sequence=first_sequence,
second_sequence=second_sequence)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
assert len(model.id2label) > 0

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model prediction outputs
topk (int): number of top-scoring labels to return

Returns:
Dict[str, str]: the prediction results
"""

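# top-k selection: np.argpartition surfaces the k largest scores in
# arbitrary order, and the argsort then orders those k ascending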
probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
cls_ids = top_indices[np.argsort(probs[top_indices])]
probs = probs[cls_ids].tolist()

cls_names = [self.model.id2label[cid] for cid in cls_ids]

return {'scores': probs, 'labels': cls_names}

+ 19
- 9
modelscope/pipelines/nlp/sentence_similarity_pipeline.py

@@ -1,11 +1,13 @@
from typing import Any, Dict, Union

import numpy as np
import torch

from modelscope.models.nlp import SbertForSentenceSimilarity
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForSentenceSimilarity
from ...preprocessors import SequenceClassificationPreprocessor
from ...utils.constant import Tasks
from ..base import Input, Pipeline
from ..builder import PIPELINES

@@ -13,13 +15,14 @@ __all__ = ['SentenceSimilarityPipeline']


@PIPELINES.register_module(
Tasks.sentence_similarity,
module_name=r'sbert-base-chinese-sentence-similarity')
Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity)
class SentenceSimilarityPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentenceSimilarity, str],
model: Union[Model, str],
preprocessor: SequenceClassificationPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction

@@ -35,14 +38,21 @@ class SentenceSimilarityPipeline(Pipeline):
if preprocessor is None:
preprocessor = SequenceClassificationPreprocessor(
sc_model.model_dir,
first_sequence='first_sequence',
second_sequence='second_sequence')
first_sequence=first_sequence,
second_sequence=second_sequence)
sc_model.eval()
super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs)

assert hasattr(self.model, 'id2label'), \
'id2label map should be initialized in the init function.'

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 77
- 0
modelscope/pipelines/nlp/sentiment_classification_pipeline.py

@@ -0,0 +1,77 @@
from typing import Any, Dict, Union

import numpy as np
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForSentimentClassification
from ...preprocessors import SentimentClassificationPreprocessor
from ...utils.constant import Tasks
from ..base import Input, Pipeline
from ..builder import PIPELINES

__all__ = ['SentimentClassificationPipeline']


@PIPELINES.register_module(
Tasks.sentiment_classification,
module_name=Pipelines.sentiment_classification)
class SentimentClassificationPipeline(Pipeline):

def __init__(self,
model: Union[SbertForSentimentClassification, str],
preprocessor: SentimentClassificationPreprocessor = None,
first_sequence='first_sequence',
second_sequence='second_sequence',
**kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SbertForSentimentClassification): a model instance
preprocessor (SentimentClassificationPreprocessor): a preprocessor instance
"""
assert isinstance(model, str) or isinstance(model, SbertForSentimentClassification), \
'model must be a single str or SbertForSentimentClassification'
model = model if isinstance(
model,
SbertForSentimentClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = SentimentClassificationPreprocessor(
model.model_dir,
first_sequence=first_sequence,
second_sequence=second_sequence)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
assert len(model.id2label) > 0

def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self,
inputs: Dict[str, Any],
topk: int = 5) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): the model prediction outputs
topk (int): number of top-scoring labels to return

Returns:
Dict[str, str]: the prediction results
"""

probs = inputs['probabilities'][0]
num_classes = probs.shape[0]
topk = min(topk, num_classes)
top_indices = np.argpartition(probs, -topk)[-topk:]
cls_ids = top_indices[np.argsort(probs[top_indices])]
probs = probs[cls_ids].tolist()

cls_names = [self.model.id2label[cid] for cid in cls_ids]

return {'scores': probs, 'labels': cls_names}

+ 2
- 1
modelscope/pipelines/nlp/sequence_classification_pipeline.py

@@ -2,6 +2,7 @@ from typing import Any, Dict, Union

import numpy as np

from modelscope.metainfo import Pipelines
from modelscope.models.nlp import BertForSequenceClassification
from modelscope.preprocessors import SequenceClassificationPreprocessor
from modelscope.utils.constant import Tasks
@@ -13,7 +14,7 @@ __all__ = ['SequenceClassificationPipeline']


@PIPELINES.register_module(
Tasks.text_classification, module_name=r'bert-sentiment-analysis')
Tasks.text_classification, module_name=Pipelines.sentiment_analysis)
class SequenceClassificationPipeline(Pipeline):

def __init__(self,


+ 0
- 46
modelscope/pipelines/nlp/space/dialog_state_tracking.py

@@ -1,46 +0,0 @@
from typing import Any, Dict, Optional

from modelscope.models.nlp import DialogModelingModel
from modelscope.preprocessors import DialogModelingPreprocessor
from modelscope.utils.constant import Tasks
from ...base import Pipeline, Tensor
from ...builder import PIPELINES

__all__ = ['DialogStateTrackingPipeline']


@PIPELINES.register_module(
Tasks.dialog_state_tracking, module_name=r'space-dst')
class DialogStateTrackingPipeline(Pipeline):

def __init__(self, model: DialogModelingModel,
preprocessor: DialogModelingPreprocessor, **kwargs):
"""use `model` and `preprocessor` to create a nlp text classification pipeline for prediction

Args:
model (SequenceClassificationModel): a model instance
preprocessor (SequenceClassificationPreprocessor): a preprocessor instance
"""

super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.model = model
self.preprocessor = preprocessor

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
"""process the prediction results

Args:
inputs (Dict[str, Any]): _description_

Returns:
Dict[str, str]: the prediction results
"""
sys_rsp = self.preprocessor.text_field.tokenizer.convert_ids_to_tokens(
inputs['resp'])
assert len(sys_rsp) > 2
sys_rsp = sys_rsp[1:len(sys_rsp) - 1]
# sys_rsp = self.preprocessor.text_field.tokenizer.

inputs['sys'] = sys_rsp

return inputs

+ 18
- 7
modelscope/pipelines/nlp/text_generation_pipeline.py

@@ -1,16 +1,20 @@
from typing import Dict, Optional, Union
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import PalmForTextGeneration
from modelscope.preprocessors import TextGenerationPreprocessor
from modelscope.utils.constant import Tasks
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import PalmForTextGeneration
from ...preprocessors import TextGenerationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

__all__ = ['TextGenerationPipeline']


@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0')
@PIPELINES.register_module(
Tasks.text_generation, module_name=Pipelines.text_generation)
class TextGenerationPipeline(Pipeline):

def __init__(self,
@@ -31,10 +35,17 @@ class TextGenerationPipeline(Pipeline):
model.tokenizer,
first_sequence='sentence',
second_sequence=None)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = model.tokenizer

def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Tensor],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 19
- 9
modelscope/pipelines/nlp/word_segmentation_pipeline.py

@@ -1,9 +1,12 @@
from typing import Any, Dict, Optional, Union

from modelscope.models import Model
from modelscope.models.nlp import StructBertForTokenClassification
from modelscope.preprocessors import TokenClassifcationPreprocessor
from modelscope.utils.constant import Tasks
import torch

from ...metainfo import Pipelines
from ...models import Model
from ...models.nlp import SbertForTokenClassification
from ...preprocessors import TokenClassifcationPreprocessor
from ...utils.constant import Tasks
from ..base import Pipeline, Tensor
from ..builder import PIPELINES

@@ -11,12 +14,11 @@ __all__ = ['WordSegmentationPipeline']


@PIPELINES.register_module(
Tasks.word_segmentation,
module_name=r'structbert-chinese-word-segmentation')
Tasks.word_segmentation, module_name=Pipelines.word_segmentation)
class WordSegmentationPipeline(Pipeline):

def __init__(self,
model: Union[StructBertForTokenClassification, str],
model: Union[SbertForTokenClassification, str],
preprocessor: Optional[TokenClassifcationPreprocessor] = None,
**kwargs):
"""use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction
@@ -27,15 +29,23 @@ class WordSegmentationPipeline(Pipeline):
"""
model = model if isinstance(
model,
StructBertForTokenClassification) else Model.from_pretrained(model)
SbertForTokenClassification) else Model.from_pretrained(model)
if preprocessor is None:
preprocessor = TokenClassifcationPreprocessor(model.model_dir)
model.eval()
super().__init__(model=model, preprocessor=preprocessor, **kwargs)
self.tokenizer = preprocessor.tokenizer
self.config = model.config
assert len(self.config.id2label) > 0
self.id2label = self.config.id2label

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]:
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
with torch.no_grad():
return super().forward(inputs, **forward_params)

def postprocess(self, inputs: Dict[str, Any],
**postprocess_params) -> Dict[str, str]:
"""process the prediction results

Args:


+ 33
- 0
modelscope/pipelines/outputs.py

@@ -45,6 +45,12 @@ TASK_OUTPUTS = {
Tasks.image_matting: ['output_png'],
Tasks.image_generation: ['output_png'],

# action recognition result for single video
# {
# "output_label": "abseiling"
# }
Tasks.action_recognition: ['output_label'],

# pose estimation result for single sample
# {
# "poses": np.array with shape [num_pose, num_keypoint, 3],
@@ -54,6 +60,13 @@ TASK_OUTPUTS = {
# }
Tasks.pose_estimation: ['poses', 'boxes'],

# ocr detection result for single sample
# {
# "det_polygons": np.array with shape [num_text, 8], each box is
# [x1, y1, x2, y2, x3, y3, x4, y4]
# }
Tasks.ocr_detection: ['det_polygons'],

# ============ nlp tasks ===================

# text classification result for single sample
@@ -69,6 +82,12 @@ TASK_OUTPUTS = {
# }
Tasks.text_generation: ['text'],

# fill mask result for single sample
# {
# "text": "this is the text which masks filled by model."
# }
Tasks.fill_mask: ['text'],

# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
@@ -82,6 +101,20 @@ TASK_OUTPUTS = {
# }
Tasks.sentence_similarity: ['scores', 'labels'],

# sentiment classification result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.sentiment_classification: ['scores', 'labels'],

# nli result for single sample
# {
# "labels": ["happy", "sad", "calm", "angry"],
# "scores": [0.9, 0.1, 0.05, 0.05]
# }
Tasks.nli: ['scores', 'labels'],

# ============ audio tasks ===================

# audio processed for single file in PCM format


+ 44
- 12
modelscope/pipelines/util.py

@@ -2,8 +2,8 @@
import os.path as osp
from typing import List, Union

from maas_hub.file_download import model_file_download
from modelscope.hub.api import HubApi
from modelscope.hub.file_download import model_file_download
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
@@ -20,31 +20,63 @@ def is_config_has_model(cfg_file):
return False


def is_model_name(model: Union[str, List]):
""" whether model is a valid modelhub path
def is_official_hub_path(path: Union[str, List]):
""" Whether path is a official hub name or a valid local
path to official hub directory.
"""

def is_official_hub_impl(path):
if osp.exists(path):
cfg_file = osp.join(path, ModelFile.CONFIGURATION)
return osp.exists(cfg_file)
else:
try:
_ = HubApi().get_model(path)
return True
except Exception:
return False

if isinstance(path, str):
return is_official_hub_impl(path)
else:
results = [is_official_hub_impl(m) for m in path]
all_true = all(results)
any_true = any(results)
if any_true and not all_true:
raise ValueError(
f'some models are hub addresses, some are not, model list: {path}'
)

return all_true


def is_model(path: Union[str, List]):
""" whether path is a valid modelhub path and containing model config
"""

def is_model_name_impl(model):
if osp.exists(model):
cfg_file = osp.join(model, ModelFile.CONFIGURATION)
def is_modelhub_path_impl(path):
if osp.exists(path):
cfg_file = osp.join(path, ModelFile.CONFIGURATION)
if osp.exists(cfg_file):
return is_config_has_model(cfg_file)
else:
return False
else:
try:
cfg_file = model_file_download(model, ModelFile.CONFIGURATION)
cfg_file = model_file_download(path, ModelFile.CONFIGURATION)
return is_config_has_model(cfg_file)
except Exception:
return False

if isinstance(model, str):
return is_model_name_impl(model)
if isinstance(path, str):
return is_modelhub_path_impl(path)
else:
results = [is_model_name_impl(m) for m in model]
results = [is_modelhub_path_impl(m) for m in path]
all_true = all(results)
any_true = any(results)
if any_true and not all_true:
raise ValueError('some model are hub address, some are not')
raise ValueError(
f'some models are hub addresses, some are not, model list: {path}'
)

return all_true

+ 4
- 4
modelscope/preprocessors/__init__.py

@@ -1,12 +1,12 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from .audio import LinearAECAndFbank
# from .audio import LinearAECAndFbank
from .base import Preprocessor
from .builder import PREPROCESSORS, build_preprocessor
# from .builder import PREPROCESSORS, build_preprocessor
from .common import Compose
from .image import LoadImage, load_image
from .nlp import * # noqa F403
from .space.dialog_intent_prediction_preprocessor import * # noqa F403
from .space.dialog_modeling_preprocessor import * # noqa F403
from .space.dialog_state_tracking_preprocessor import * # noqa F403
from .text_to_speech import * # noqa F403

# from .text_to_speech import * # noqa F403

+ 2
- 1
modelscope/preprocessors/image.py

@@ -5,11 +5,12 @@ from typing import Dict, Union
from PIL import Image, ImageOps

from modelscope.fileio import File
from modelscope.metainfo import Preprocessors
from modelscope.utils.constant import Fields
from .builder import PREPROCESSORS


@PREPROCESSORS.register_module(Fields.cv)
@PREPROCESSORS.register_module(Fields.cv, Preprocessors.load_image)
class LoadImage:
"""Load an image from file or url.
Added or updated keys are "filename", "img", "img_shape",


modelscope/pipelines/multi_modal/image_caption_pipeline.py → modelscope/preprocessors/multi_modal.py

@@ -1,32 +1,48 @@
from typing import Any, Dict
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from typing import Any, Dict, Union

import numpy as np
import torch
from PIL import Image

from modelscope.pipelines.base import Input
from modelscope.preprocessors import load_image
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from ..base import Pipeline
from ..builder import PIPELINES
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Preprocessors
from modelscope.utils.constant import Fields, ModelFile
from modelscope.utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS
from .image import load_image

logger = get_logger()
__all__ = [
'OfaImageCaptionPreprocessor',
]


@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa')
class ImageCaptionPipeline(Pipeline):
# TODO: refine using modelhub
def __init__(self, model: str, bpe_dir: str):
super().__init__()
# turn on cuda if GPU is available
@PREPROCESSORS.register_module(
Fields.multi_modal, module_name=Preprocessors.ofa_image_caption)
class OfaImageCaptionPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
super().__init__(*args, **kwargs)

if osp.exists(model_dir):
local_model_dir = model_dir
else:
local_model_dir = snapshot_download(model_dir)
local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE)
bpe_dir = local_model_dir

from fairseq import checkpoint_utils, tasks, utils
from ofa.tasks.mm_tasks import CaptionTask

tasks.register_task('caption', CaptionTask)
use_cuda = False
# use fp16 only when GPU is available
use_fp16 = False

overrides = {
'bpe_dir': bpe_dir,
'eval_cider': False,
@@ -35,21 +51,9 @@ class ImageCaptionPipeline(Pipeline):
'no_repeat_ngram_size': 3,
'seed': 7
}
models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(model), arg_overrides=overrides)

# Move models to GPU
for model in models:
model.eval()
if use_cuda:
model.cuda()
if use_fp16:
model.half()
model.prepare_for_inference_(cfg)
self.models = models
# Initialize generator
self.generator = task.build_generator(models, cfg.generation)

model, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(local_model), arg_overrides=overrides)
del model
# Initialize transform
from torchvision import transforms
mean = [0.5, 0.5, 0.5]
@@ -69,7 +73,8 @@ class ImageCaptionPipeline(Pipeline):
self.eos_item = torch.LongTensor([task.src_dict.eos()])
self.pad_idx = task.src_dict.pad()

def preprocess(self, input: Input) -> Dict[str, Any]:
@type_assert(object, (str, tuple, Image.Image))
def __call__(self, data: Union[str, tuple, Image.Image]) -> Dict[str, Any]:

def encode_text(text, length=None, append_bos=False, append_eos=False):
s = self.task.tgt_dict.encode_line(
@@ -84,11 +89,11 @@ class ImageCaptionPipeline(Pipeline):
s = torch.cat([s, self.eos_item])
return s

if isinstance(input, Image.Image):
patch_image = self.patch_resize_transform(input).unsqueeze(0)
if isinstance(data, Image.Image):
patch_image = self.patch_resize_transform(data).unsqueeze(0)
else:
patch_image = self.patch_resize_transform(
load_image(input)).unsqueeze(0)
load_image(data)).unsqueeze(0)
patch_mask = torch.tensor([True])
text = 'what does the image describe?'
src_text = encode_text(
@@ -105,17 +110,3 @@ class ImageCaptionPipeline(Pipeline):
}
}
return sample

def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
from ofa.utils.eval_utils import eval_caption

results, _ = eval_caption(self.task, self.generator, self.models,
input)
return {
'image_id': results[0]['image_id'],
'caption': results[0]['caption']
}

def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
# What should we do here ?
return inputs

+ 223
- 21
modelscope/preprocessors/nlp.py

@@ -5,14 +5,17 @@ from typing import Any, Dict, Union

from transformers import AutoTokenizer

from modelscope.utils.constant import Fields, InputFields
from modelscope.utils.type_assert import type_assert
from ..metainfo import Models, Preprocessors
from ..utils.constant import Fields, InputFields
from ..utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
'Tokenize', 'SequenceClassificationPreprocessor',
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor'
'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor',
'NLIPreprocessor', 'SentimentClassificationPreprocessor',
'FillMaskPreprocessor'
]


@@ -31,7 +34,141 @@ class Tokenize(Preprocessor):


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-sequence-classification')
Fields.nlp, module_name=Preprocessors.nli_tokenizer)
class NLIPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, tuple)
def __call__(self, data: tuple) -> Dict[str, Any]:
"""process the raw input data

Args:
data (tuple): [sentence1, sentence2]
sentence1 (str): a sentence
Example:
'you are so handsome.'
sentence2 (str): a sentence
Example:
'you are so beautiful.'
Returns:
Dict[str, Any]: the preprocessed data
"""
sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
# preprocess the data for the model input

rst = {
'id': [],
'input_ids': [],
'attention_mask': [],
'token_type_ids': []
}

max_seq_length = self.sequence_length

text_a = new_data[self.first_sequence]
text_b = new_data[self.second_sequence]
feature = self.tokenizer(
text_a,
text_b,
padding=False,
truncation=True,
max_length=max_seq_length)

rst['id'].append(new_data.get('id', str(uuid.uuid4())))
rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return rst


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
class SentimentClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""

super().__init__(*args, **kwargs)

from sofa import SbertTokenizer
self.model_dir: str = model_dir
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'
Returns:
Dict[str, Any]: the preprocessed data
"""

new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {
'id': [],
'input_ids': [],
'attention_mask': [],
'token_type_ids': []
}

max_seq_length = self.sequence_length

text_a = new_data[self.first_sequence]

text_b = new_data.get(self.second_sequence, None)
feature = self.tokenizer(
text_a,
text_b,
padding='max_length',
truncation=True,
max_length=max_seq_length)

rst['id'].append(new_data.get('id', str(uuid.uuid4())))
rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return rst


@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
class SequenceClassificationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -53,12 +190,12 @@ class SequenceClassificationPreprocessor(Preprocessor):
self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
print(f'this is the tokenzier {self.tokenizer}')

@type_assert(object, (str, tuple))
def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]:
@type_assert(object, (str, tuple, Dict))
def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str or tuple):
data (str, tuple or Dict):
sentence1 (str): a sentence
Example:
'you are so handsome.'
@@ -70,22 +207,31 @@ class SequenceClassificationPreprocessor(Preprocessor):
sentence2 (str): a sentence
Example:
'you are so beautiful.'
or
{field1: field_value1, field2: field_value2}
field1 (str): field name, default 'first_sequence'
field_value1 (str): a sentence
Example:
'you are so handsome.'

field2 (str): field name, default 'second_sequence'
field_value2 (str): a sentence
Example:
'you are so beautiful.'

Returns:
Dict[str, Any]: the preprocessed data
"""

if not isinstance(data, tuple):
data = (
data,
None,
)

sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
if isinstance(data, str):
new_data = {self.first_sequence: data}
elif isinstance(data, tuple):
sentence1, sentence2 = data
new_data = {
self.first_sequence: sentence1,
self.second_sequence: sentence2
}
else:
new_data = data

# preprocess the data for the model input

@@ -115,7 +261,8 @@ class SequenceClassificationPreprocessor(Preprocessor):
return rst


@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0')
@PREPROCESSORS.register_module(
Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
class TextGenerationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
@@ -166,12 +313,66 @@ class TextGenerationPreprocessor(Preprocessor):

rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
return {k: torch.tensor(v) for k, v in rst.items()}


@PREPROCESSORS.register_module(Fields.nlp)
class FillMaskPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
"""preprocess the data via the vocab.txt from the `model_dir` path

Args:
model_dir (str): model path
"""
super().__init__(*args, **kwargs)
from sofa.utils.backend import AutoTokenizer
self.model_dir = model_dir
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.sequence_length = kwargs.pop('sequence_length', 128)

self.tokenizer = AutoTokenizer.from_pretrained(
model_dir, use_fast=False)

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
"""process the raw input data

Args:
data (str): a sentence
Example:
'you are so handsome.'

Returns:
Dict[str, Any]: the preprocessed data
"""
import torch

new_data = {self.first_sequence: data}
# preprocess the data for the model input

rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}

max_seq_length = self.sequence_length

text_a = new_data[self.first_sequence]
feature = self.tokenizer(
text_a,
padding='max_length',
truncation=True,
max_length=max_seq_length,
return_token_type_ids=True)

rst['input_ids'].append(feature['input_ids'])
rst['attention_mask'].append(feature['attention_mask'])
rst['token_type_ids'].append(feature['token_type_ids'])

return {k: torch.tensor(v) for k, v in rst.items()}
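
A minimal sketch of the new fill-mask preprocessor in use (the model directory path is a placeholder):

preprocessor = FillMaskPreprocessor('/path/to/model_dir')
features = preprocessor('you are so handsome.')
# features is a dict of torch tensors of shape [1, sequence_length]:
# 'input_ids', 'attention_mask' and 'token_type_ids'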


@PREPROCESSORS.register_module(
Fields.nlp, module_name=r'bert-token-classification')
Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
class TokenClassifcationPreprocessor(Preprocessor):

def __init__(self, model_dir: str, *args, **kwargs):
@@ -199,6 +400,7 @@ class TokenClassifcationPreprocessor(Preprocessor):
Returns:
Dict[str, Any]: the preprocessed data
"""

# preprocess the data for the model input

text = data.replace(' ', '').strip()


+ 4
- 5
modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py View File

@@ -3,13 +3,12 @@
import os
from typing import Any, Dict

from modelscope.preprocessors.space.fields.intent_field import \
IntentBPETextField
from modelscope.utils.config import Config
from modelscope.utils.constant import Fields
from modelscope.utils.type_assert import type_assert
from ...utils.config import Config
from ...utils.constant import Fields
from ...utils.type_assert import type_assert
from ..base import Preprocessor
from ..builder import PREPROCESSORS
from .fields.intent_field import IntentBPETextField

__all__ = ['DialogIntentPredictionPreprocessor']



+ 6
- 8
modelscope/preprocessors/space/dialog_modeling_preprocessor.py View File

@@ -1,16 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import uuid
from typing import Any, Dict, Union

from modelscope.preprocessors.space.fields.gen_field import \
MultiWOZBPETextField
from modelscope.utils.config import Config
from modelscope.utils.constant import Fields, InputFields
from modelscope.utils.type_assert import type_assert
from typing import Any, Dict

from ...utils.config import Config
from ...utils.constant import Fields
from ...utils.type_assert import type_assert
from ..base import Preprocessor
from ..builder import PREPROCESSORS
from .fields.gen_field import MultiWOZBPETextField

__all__ = ['DialogModelingPreprocessor']



+ 31
- 29
modelscope/preprocessors/space/fields/dst_processors.py View File

@@ -154,14 +154,16 @@ utter3 = {
'User-2':
'I am looking for an expensive indian restaurant in the area of centre.',
'System-2':
'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant in the center of town. I can book a table for you, if you like.',
'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant '
'in the center of town. I can book a table for you, if you like.',
'Dialog_Act-2': {
'Restaurant-Recommend': [['area', 'center of town'],
['food', 'Indian'],
['name', 'Saffron Brasserie'],
['pricerange', 'expensive']]
},
'User-3': 'Sure thing, please book for 6 people at 19:30 on Saturday.'
'User-3':
'Sure thing, please book for 6 people at 19:30 on Saturday.'
}

history_states3 = [{}, {
@@ -346,7 +348,6 @@ history_states3 = [{}, {


class DSTProcessor(object):

ACTS_DICT = {
'taxi-depart': 'taxi-departure',
'taxi-dest': 'taxi-destination',
@@ -380,7 +381,8 @@ class DSTProcessor(object):

def _convert_inputs_to_utterances(self, inputs: dict,
history_states: list):
"""This method is to generate the utterances with user, sys, dialog_acts and metadata, while metadata is from the history_states or the output from the inference pipline"""
"""This method is to generate the utterances with user, sys, dialog_acts and metadata,
while metadata is from the history_states or the output from the inference pipline"""

utterances = []
user_inputs = []
@@ -427,8 +429,8 @@ class DSTProcessor(object):
if isinstance(item, dict):
for a in item:
aa = a.lower().split('-')
if aa[1] == 'inform' or aa[1] == 'recommend' or aa[
1] == 'select' or aa[1] == 'book':
if aa[1] == 'inform' or aa[1] == 'recommend' or \
aa[1] == 'select' or aa[1] == 'book':
for i in item[a]:
s = i[0].lower()
v = i[1].lower().strip()
@@ -443,7 +445,7 @@ class DSTProcessor(object):
if key not in s_dict:
s_dict[key] = list([v])
# ... Option 2: Keep last informed value
#s_dict[key] = list([v])
# s_dict[key] = list([v])

return s_dict

@@ -454,26 +456,26 @@ class multiwoz22Processor(DSTProcessor):
super().__init__()

def normalize_time(self, text):
text = re.sub('(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2',
text = re.sub(r'(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2',
text) # am/pm without space
text = re.sub('(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3',
text = re.sub(r'(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3',
text) # am/pm short to long form
text = re.sub(
'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)',
r'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)',
r'\1\2 \3:\4\5', text) # Missing separator
text = re.sub('(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3',
text = re.sub(r'(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3',
text) # Wrong separator
text = re.sub('(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)',
text = re.sub(r'(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)',
r'\1\2 \3:00\4', text) # normalize simple full hour time
text = re.sub('(^| )(\d{1}:\d{2})', r'\g<1>0\2',
text = re.sub(r'(^| )(\d{1}:\d{2})', r'\g<1>0\2',
text) # Add missing leading 0
# Map 12 hour times to 24 hour times
text = re.sub(
'(\d{2})(:\d{2}) ?p\.?m\.?', lambda x: str(
int(x.groups()[0]) + 12
if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups(
)[1], text)
text = re.sub('(^| )24:(\d{2})', r'\g<1>00:\2',
text = \
re.sub(
r'(\d{2})(:\d{2}) ?p\.?m\.?',
lambda x: str(int(x.groups()[0]) + 12
if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups()[1], text)
text = re.sub(r'(^| )24:(\d{2})', r'\g<1>00:\2',
text) # Correct times that use 24 as hour
return text
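
A hand-traced example of the normalization chain above (the input sentence is illustrative and worked out from the regexes, not taken from this diff):

proc = multiwoz22Processor()
proc.normalize_time('book a table at 530pm')
# 'at 530pm' -> 'at 530 pm'   (am/pm without space)
#            -> 'at 5:30 pm'  (missing separator)
#            -> 'at 05:30 pm' (add missing leading 0)
#            -> 'at 17:30'    (map 12 hour time to 24 hour time)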

@@ -508,8 +510,8 @@ class multiwoz22Processor(DSTProcessor):
if isinstance(acts[d][t]['dialog_act'], dict):
for a in acts[d][t]['dialog_act']:
aa = a.lower().split('-')
if aa[1] == 'inform' or aa[1] == 'recommend' or aa[
1] == 'select' or aa[1] == 'book':
if aa[1] == 'inform' or aa[1] == 'recommend' \
or aa[1] == 'select' or aa[1] == 'book':
for i in acts[d][t]['dialog_act'][a]:
s = i[0].lower()
v = i[1].lower().strip()
@@ -524,7 +526,7 @@ class multiwoz22Processor(DSTProcessor):
if key not in s_dict:
s_dict[key] = list([v])
# ... Option 2: Keep last informed value
#s_dict[key] = list([v])
# s_dict[key] = list([v])
return s_dict

# This should only contain label normalizations. All other mappings should
@@ -560,7 +562,7 @@ class multiwoz22Processor(DSTProcessor):
utt_lower = convert_to_unicode(utt).lower()
utt_lower = self.normalize_text(utt_lower)
utt_tok = [
tok for tok in map(str.strip, re.split('(\W+)', utt_lower))
tok for tok in map(str.strip, re.split(r'(\W+)', utt_lower))
if len(tok) > 0
]
return utt_tok
@@ -582,7 +584,7 @@ class multiwoz22Processor(DSTProcessor):
find_pos = []
found = False
label_list = [
item for item in map(str.strip, re.split('(\W+)', value_label))
item for item in map(str.strip, re.split(r'(\W+)', value_label))
if len(item) > 0
]
len_label = len(label_list)
@@ -633,11 +635,11 @@ class multiwoz22Processor(DSTProcessor):
def is_in_list(self, tok, value):
found = False
tok_list = [
item for item in map(str.strip, re.split('(\W+)', tok))
item for item in map(str.strip, re.split(r'(\W+)', tok))
if len(item) > 0
]
value_list = [
item for item in map(str.strip, re.split('(\W+)', value))
item for item in map(str.strip, re.split(r'(\W+)', value))
if len(item) > 0
]
tok_len = len(tok_list)
@@ -938,8 +940,8 @@ class multiwoz22Processor(DSTProcessor):
if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[
slot]:
print('(%s): %s, ' % (slot, value_label), end='')
elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[
slot] and class_type != 'copy_value' and class_type != 'inform':
elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] \
and class_type != 'copy_value' and class_type != 'inform':
# If slot has seen before and its class type did not change, label this slot a not present,
# assuming that the slot has not actually been mentioned in this turn.
# Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform,
@@ -1262,7 +1264,7 @@ def convert_examples_to_features(examples,

def _get_start_end_pos(class_type, token_label_ids, max_seq_length):
if class_type == 'copy_value' and 1 not in token_label_ids:
#logger.warn("copy_value label, but token_label not detected. Setting label to 'none'.")
# logger.warn("copy_value label, but token_label not detected. Setting label to 'none'.")
class_type = 'none'
start_pos = 0
end_pos = 0


+ 4
- 4
modelscope/preprocessors/space/fields/gen_field.py View File

@@ -8,10 +8,10 @@ from itertools import chain

import numpy as np

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.nlp.space import ontology, utils
from modelscope.utils.nlp.space.db_ops import MultiWozDB
from modelscope.utils.nlp.space.utils import list2np
from ....utils.nlp.space import ontology, utils
from ....utils.nlp.space.db_ops import MultiWozDB
from ....utils.nlp.space.utils import list2np
from ..tokenizer import Tokenizer


class BPETextField(object):


+ 4
- 4
modelscope/preprocessors/space/fields/intent_field.py View File

@@ -14,10 +14,10 @@ import json
import numpy as np
from tqdm import tqdm

from modelscope.preprocessors.space.tokenizer import Tokenizer
from modelscope.utils.nlp.space import ontology, utils
from modelscope.utils.nlp.space.scores import hierarchical_set_score
from modelscope.utils.nlp.space.utils import list2np
from ....utils.nlp.space import ontology, utils
from ....utils.nlp.space.scores import hierarchical_set_score
from ....utils.nlp.space.utils import list2np
from ..tokenizer import Tokenizer


class BPETextField(object):


+ 3
- 4
modelscope/preprocessors/text_to_speech.py View File

@@ -2,9 +2,8 @@
import io
from typing import Any, Dict, Union

import ttsfrd

from modelscope.fileio import File
from modelscope.metainfo import Preprocessors
from modelscope.models.audio.tts.frontend import GenericTtsFrontend
from modelscope.models.base import Model
from modelscope.utils.audio.tts_exceptions import * # noqa F403
@@ -12,11 +11,11 @@ from modelscope.utils.constant import Fields
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols']
__all__ = ['TextToTacotronSymbols']


@PREPROCESSORS.register_module(
Fields.audio, module_name=r'text_to_tacotron_symbols')
Fields.audio, module_name=Preprocessors.text_to_tacotron_symbols)
class TextToTacotronSymbols(Preprocessor):
"""extract tacotron symbols from text.



+ 232
- 0
modelscope/preprocessors/video.py View File

@@ -0,0 +1,232 @@
import math
import os
import random

import decord
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
import torch.utils.dlpack as dlpack
import torchvision.transforms._transforms_video as transforms
from decord import VideoReader
from torchvision.transforms import Compose


def ReadVideoData(cfg, video_path):
""" simple interface to load video frames from file

Args:
cfg (Config): The global config object.
video_path (str): video file path
"""
data = _decode_video(cfg, video_path)
transform = kinetics400_transform(cfg)
data_list = []
for i in range(data.size(0)):
for j in range(cfg.TEST.NUM_SPATIAL_CROPS):
transform.transforms[1].set_spatial_index(j)
data_list.append(transform(data[i]))
return torch.stack(data_list, dim=0)
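
A usage sketch, assuming cfg carries the DATA/TEST fields referenced below; the video path is this repo's test asset:

data = ReadVideoData(cfg, 'data/test/videos/action_recognition_test_video.mp4')
# data stacks one transformed view per (temporal clip, spatial crop) pair along dim 0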


def kinetics400_transform(cfg):
"""
Configures the transform for the Kinetics-400 dataset.
We apply controlled spatial cropping and normalization.
Args:
cfg (Config): The global config object.
"""
resize_video = KineticsResizedCrop(
short_side_range=[cfg.DATA.TEST_SCALE, cfg.DATA.TEST_SCALE],
crop_size=cfg.DATA.TEST_CROP_SIZE,
num_spatial_crops=cfg.TEST.NUM_SPATIAL_CROPS)
std_transform_list = [
transforms.ToTensorVideo(), resize_video,
transforms.NormalizeVideo(
mean=cfg.DATA.MEAN, std=cfg.DATA.STD, inplace=True)
]
return Compose(std_transform_list)


def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx,
num_clips, num_frames, interval, minus_interval):
"""
Generates the frame index list using interval-based sampling.
Args:
vid_length (int): the length of the whole video (valid selection range).
vid_fps (int): the original video fps.
target_fps (int): the normalized video fps.
clip_idx (int): -1 for random temporal sampling; non-negative values select
a specific clip from the video.
num_clips (int): the total number of clips to be sampled from each video.
Combined with clip_idx, the sampled clip is the "clip_idx-th"
clip out of "num_clips" clips.
num_frames (int): number of frames in each sampled clip.
interval (int): the interval between sampled frames.
minus_interval (bool): controls the end index of the clip.
Returns:
index (tensor): the sampled frame indices
"""
if num_frames == 1:
index = [random.randint(0, vid_length - 1)]
else:
# transform FPS
clip_length = num_frames * interval * vid_fps / target_fps

max_idx = max(vid_length - clip_length, 0)
start_idx = clip_idx * math.floor(max_idx / (num_clips - 1))
if minus_interval:
end_idx = start_idx + clip_length - interval
else:
end_idx = start_idx + clip_length - 1

index = torch.linspace(start_idx, end_idx, num_frames)
index = torch.clamp(index, 0, vid_length - 1).long()

return index
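
A hand-worked sketch of the arithmetic above (all numbers are illustrative):

index = _interval_based_sampling(vid_length=300, vid_fps=30, target_fps=30,
                                 clip_idx=0, num_clips=3, num_frames=16,
                                 interval=4, minus_interval=False)
# clip_length = 16 * 4 * 30 / 30 = 64 frames; max_idx = 300 - 64 = 236;
# start_idx = 0 * floor(236 / 2) = 0, so index holds 16 evenly spaced
# frame indices spanning [0, 63]; clip_idx=1 would span [118, 181].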


def _decode_video_frames_list(cfg, frames_list, vid_fps):
"""
Decodes the video given the numpy frames.
Args:
cfg (Config): The global config object.
frames_list (list): all frames for a video, the frames should be numpy array.
vid_fps (int): the fps of this video.
Returns:
frames (Tensor): video tensor data
"""
assert isinstance(frames_list, list)
num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS

frame_list = []
for clip_idx in range(num_clips_per_video):
# for each clip in the video,
# a list is generated before decoding the specified frames from the video
list_ = _interval_based_sampling(
len(frames_list), vid_fps, cfg.DATA.TARGET_FPS, clip_idx,
num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
frames = None
frames = torch.from_numpy(
np.stack([frames_list[l_index] for l_index in list_.tolist()],
axis=0))
frame_list.append(frames)
frames = torch.stack(frame_list)
if num_clips_per_video == 1:
frames = frames.squeeze(0)

return frames


def _decode_video(cfg, path):
"""
Decodes the video from the given file path.
Args:
cfg (Config): The global config object.
path (str): video file path.
Returns:
frames (Tensor): video tensor data
"""
vr = VideoReader(path)

num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS

frame_list = []
for clip_idx in range(num_clips_per_video):
# for each clip in the video,
# a list is generated before decoding the specified frames from the video
list_ = _interval_based_sampling(
len(vr), vr.get_avg_fps(), cfg.DATA.TARGET_FPS, clip_idx,
num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES,
cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL)
frames = None
if path.endswith('.avi'):
append_list = torch.arange(0, list_[0], 4)
frames = dlpack.from_dlpack(
vr.get_batch(torch.cat([append_list,
list_])).to_dlpack()).clone()
frames = frames[append_list.shape[0]:]
else:
frames = dlpack.from_dlpack(
vr.get_batch(list_).to_dlpack()).clone()
frame_list.append(frames)
frames = torch.stack(frame_list)
if num_clips_per_video == 1:
frames = frames.squeeze(0)
del vr
return frames


class KineticsResizedCrop(object):
"""Perform resize and crop for kinetics-400 dataset
Args:
short_side_range (list): The length of the short side range. In inference, this should be [256, 256]
crop_size (int): The cropped size for frames.
num_spatial_crops (int): The number of the cropped spatial regions in each video.
"""

def __init__(
self,
short_side_range,
crop_size,
num_spatial_crops=1,
):
self.idx = -1
self.short_side_range = short_side_range
self.crop_size = int(crop_size)
self.num_spatial_crops = num_spatial_crops

def _get_controlled_crop(self, clip):
"""Perform controlled crop for video tensor.
Args:
clip (Tensor): the video data, the shape is [T, C, H, W]
"""
_, _, clip_height, clip_width = clip.shape

length = self.short_side_range[0]

if clip_height < clip_width:
new_clip_height = int(length)
new_clip_width = int(clip_width / clip_height * new_clip_height)
new_clip = torch.nn.functional.interpolate(
clip, size=(new_clip_height, new_clip_width), mode='bilinear')
else:
new_clip_width = int(length)
new_clip_height = int(clip_height / clip_width * new_clip_width)
new_clip = torch.nn.functional.interpolate(
clip, size=(new_clip_height, new_clip_width), mode='bilinear')
x_max = int(new_clip_width - self.crop_size)
y_max = int(new_clip_height - self.crop_size)
if self.num_spatial_crops == 1:
x = x_max // 2
y = y_max // 2
elif self.num_spatial_crops == 3:
if self.idx == 0:
if new_clip_width == length:
x = x_max // 2
y = 0
elif new_clip_height == length:
x = 0
y = y_max // 2
elif self.idx == 1:
x = x_max // 2
y = y_max // 2
elif self.idx == 2:
if new_clip_width == length:
x = x_max // 2
y = y_max
elif new_clip_height == length:
x = x_max
y = y_max // 2
return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size]

def set_spatial_index(self, idx):
"""Set the spatial cropping index for controlled cropping..
Args:
idx (int): the spatial index. The value should be in [0, 1, 2], means [left, center, right], respectively.
"""
self.idx = idx

def __call__(self, clip):
return self._get_controlled_crop(clip)
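
A sketch of three-crop evaluation with this class (tensor shapes are illustrative):

import torch

crop = KineticsResizedCrop(short_side_range=[256, 256], crop_size=224,
                           num_spatial_crops=3)
clip = torch.rand(8, 3, 240, 320)  # [T, C, H, W]
views = []
for i in range(3):  # 0/1/2 -> left, center, right
    crop.set_spatial_index(i)
    views.append(crop(clip))  # each view: [8, 3, 224, 224]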

+ 22
- 0
modelscope/pydatasets/config.py View File

@@ -0,0 +1,22 @@
import os
from pathlib import Path

# Cache location
DEFAULT_CACHE_HOME = '~/.cache'
CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME)
DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub')
MS_CACHE_HOME = os.path.expanduser(
os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME))

DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets')
MS_DATASETS_CACHE = Path(
os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE))

DOWNLOADED_DATASETS_DIR = 'downloads'
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE,
DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(
os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH))

MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT',
'http://101.201.119.157:31752')
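
Every value above is resolved at import time, so overrides must be placed in the environment before the module is first imported. A minimal sketch (paths are illustrative):

import os
os.environ['MS_CACHE_HOME'] = '/data/ms_cache'
os.environ['MS_DATASETS_CACHE'] = '/data/ms_cache/datasets'

from modelscope.pydatasets import config  # import only after setting the env
print(config.MS_DATASETS_CACHE)           # -> /data/ms_cache/datasets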

+ 323
- 58
modelscope/pydatasets/py_dataset.py View File

@@ -1,64 +1,81 @@
from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence,
Union)
import os
from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional,
Sequence, Union)

from datasets import Dataset, load_dataset
import numpy as np
from datasets import Dataset
from datasets import load_dataset as hf_load_dataset
from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE
from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
from datasets.utils.file_utils import (is_relative_path,
relative_to_absolute_path)

from modelscope.pydatasets.config import MS_DATASETS_CACHE
from modelscope.pydatasets.utils.ms_api import MsApi
from modelscope.utils.constant import Hubs
from modelscope.utils.logger import get_logger

logger = get_logger()


def format_list(para) -> List:
if para is None:
para = []
elif isinstance(para, str):
para = [para]
elif len(set(para)) < len(para):
raise ValueError(f'List columns contains duplicates: {para}')
return para


class PyDataset:
_hf_ds = None # holds the underlying HuggingFace Dataset
"""A PyDataset backed by hugging face Dataset."""

def __init__(self, hf_ds: Dataset):
def __init__(self, hf_ds: Dataset, target: Optional[str] = None):
self._hf_ds = hf_ds
self.target = None
self.target = target

def __iter__(self):
if isinstance(self._hf_ds, Dataset):
for item in self._hf_ds:
if self.target is not None:
yield item[self.target]
else:
yield item
else:
for ds in self._hf_ds.values():
for item in ds:
if self.target is not None:
yield item[self.target]
else:
yield item
for item in self._hf_ds:
if self.target is not None:
yield item[self.target]
else:
yield item

def __getitem__(self, key):
return self._hf_ds[key]

@classmethod
def from_hf_dataset(cls,
hf_ds: Dataset,
target: str = None) -> 'PyDataset':
dataset = cls(hf_ds)
dataset.target = target
return dataset
target: str = None) -> Union[dict, 'PyDataset']:
if isinstance(hf_ds, Dataset):
return cls(hf_ds, target)
if len(hf_ds.keys()) == 1:
return cls(next(iter(hf_ds.values())), target)
return {k: cls(v, target) for k, v in hf_ds.items()}

@staticmethod
def load(path: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str,
Union[str,
Sequence[str]]]]] = None,
hub: Optional[Hubs] = None) -> 'PyDataset':
def load(
dataset_name: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
hub: Optional[Hubs] = Hubs.modelscope,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None
) -> Union[dict, 'PyDataset']:
"""Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
Args:

path (str): Path or name of the dataset.
dataset_name (str): Path or name of the dataset.
target (str, optional): Name of the column to output.
version (str, optional): Version of the dataset script to load.
name (str, optional): Defining the subset_name of the dataset.
subset_name (str, optional): Defining the subset_name of the dataset.
data_dir (str, optional): Defining the data_dir of the dataset configuration.
data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s).
split (str, optional): Which split of the data to load.
@@ -67,53 +84,302 @@ class PyDataset:
Returns:
PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset.
"""
if Hubs.modelscope == hub:
# TODO: parse data meta information from modelscope hub
# and possibly download data files to local (and update path)
print('getting data from modelscope hub')
if isinstance(path, str):
dataset = load_dataset(
path,
name=name,
if hub == Hubs.huggingface:
dataset = hf_load_dataset(
dataset_name,
name=subset_name,
revision=version,
split=split,
data_dir=data_dir,
data_files=data_files)
elif isinstance(path, list):
return PyDataset.from_hf_dataset(dataset, target=target)
else:
return PyDataset._load_ms_dataset(
dataset_name,
target=target,
subset_name=subset_name,
version=version,
split=split,
data_dir=data_dir,
data_files=data_files)

@staticmethod
def _load_ms_dataset(
dataset_name: Union[str, list],
target: Optional[str] = None,
version: Optional[str] = None,
subset_name: Optional[str] = None,
split: Optional[str] = None,
data_dir: Optional[str] = None,
data_files: Optional[Union[str, Sequence[str],
Mapping[str, Union[str,
Sequence[str]]]]] = None
) -> Union[dict, 'PyDataset']:
if isinstance(dataset_name, str):
use_hf = False
if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \
(os.path.isfile(dataset_name) and dataset_name.endswith('.py')):
use_hf = True
elif is_relative_path(dataset_name):
ms_api = MsApi()
dataset_scripts = ms_api.fetch_dataset_scripts(
dataset_name, version)
if 'py' in dataset_scripts: # dataset copied from hf datasets
dataset_name = dataset_scripts['py'][0]
use_hf = True
else:
raise FileNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} "
f'or any data file in the same directory.')

if use_hf:
dataset = hf_load_dataset(
dataset_name,
name=subset_name,
revision=version,
split=split,
data_dir=data_dir,
data_files=data_files,
cache_dir=MS_DATASETS_CACHE)
else:
# TODO load from ms datahub
raise NotImplementedError(
f'Loading dataset {dataset_name} from the modelscope datahub will be '
f'implemented in the future')
elif isinstance(dataset_name, list):
if target is None:
target = 'target'
dataset = Dataset.from_dict({target: [p] for p in path})
dataset = Dataset.from_dict({target: dataset_name})
else:
raise TypeError('dataset_name must be a str or a list, but got'
f' {type(path)}')
f' {type(dataset_name)}')
return PyDataset.from_hf_dataset(dataset, target=target)
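
A sketch of the load paths above ('squad' and the file names are placeholders):

from modelscope.utils.constant import Hubs

ds = PyDataset.load('squad', split='train')             # default: ModelScope hub
ds_hf = PyDataset.load('squad', hub=Hubs.huggingface)   # explicit Hugging Face hub
ds_imgs = PyDataset.load(['a.jpg', 'b.jpg'], target='image')  # wrap a plain list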

def to_torch_dataset_with_processors(
self,
preprocessors: Union[Callable, List[Callable]],
columns: Union[str, List[str]] = None,
):
preprocessor_list = preprocessors if isinstance(
preprocessors, list) else [preprocessors]

columns = format_list(columns)

columns = [
key for key in self._hf_ds.features.keys() if key in columns
]
sample = next(iter(self._hf_ds))

sample_res = {k: np.array(sample[k]) for k in columns}
for processor in preprocessor_list:
sample_res.update(
{k: np.array(v)
for k, v in processor(sample).items()})

def is_numpy_number(value):
return np.issubdtype(value.dtype, np.integer) or np.issubdtype(
value.dtype, np.floating)

retained_columns = []
for k in sample_res.keys():
if not is_numpy_number(sample_res[k]):
logger.warning(
f'Data of column {k} is non-numeric, will be removed')
continue
retained_columns.append(k)

import torch

class MsIterableDataset(torch.utils.data.IterableDataset):

def __init__(self, dataset: Iterable):
super(MsIterableDataset).__init__()
self.dataset = dataset

def __iter__(self):
for item_dict in self.dataset:
res = {
k: np.array(item_dict[k])
for k in columns if k in retained_columns
}
for preprocessor in preprocessor_list:
res.update({
k: np.array(v)
for k, v in preprocessor(item_dict).items()
if k in retained_columns
})
yield res

return MsIterableDataset(self._hf_ds)
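
The returned iterable dataset plugs straight into a DataLoader; a sketch (the preprocessor and column name are placeholders):

from torch.utils.data import DataLoader

torch_ds = ds.to_torch_dataset_with_processors(my_preprocessor, columns=['label'])
loader = DataLoader(torch_ds, batch_size=16)
batch = next(iter(loader))  # dict of batched numeric fields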

def to_torch_dataset(
self,
columns: Union[str, List[str]] = None,
output_all_columns: bool = False,
preprocessors: Union[Callable, List[Callable]] = None,
**format_kwargs,
):
self._hf_ds.reset_format()
self._hf_ds.set_format(
type='torch',
columns=columns,
output_all_columns=output_all_columns,
format_kwargs=format_kwargs)
return self._hf_ds
"""Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to
torch.utils.data.DataLoader.

Args:
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
every sample of the dataset. The output type of processors is dict, and each numeric field of the dict
will be used as a field of torch.utils.data.Dataset.
columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the
preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None,
the output fields of processors will also be added.
format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`.

Returns:
:class:`torch.utils.data.Dataset`

"""
if not TORCH_AVAILABLE:
raise ImportError(
'The function to_torch_dataset requires pytorch to be installed'
)
if preprocessors is not None:
return self.to_torch_dataset_with_processors(preprocessors, columns=columns)
else:
self._hf_ds.reset_format()
self._hf_ds.set_format(
type='torch', columns=columns, format_kwargs=format_kwargs)
return self._hf_ds

def to_tf_dataset_with_processors(
self,
batch_size: int,
shuffle: bool,
preprocessors: Union[Callable, List[Callable]],
drop_remainder: bool = None,
prefetch: bool = True,
label_cols: Union[str, List[str]] = None,
columns: Union[str, List[str]] = None,
):
preprocessor_list = preprocessors if isinstance(
preprocessors, list) else [preprocessors]

label_cols = format_list(label_cols)
columns = format_list(columns)
cols_to_retain = list(set(label_cols + columns))
retained_columns = [
key for key in self._hf_ds.features.keys() if key in cols_to_retain
]
import tensorflow as tf
tf_dataset = tf.data.Dataset.from_tensor_slices(
np.arange(len(self._hf_ds), dtype=np.int64))
if shuffle:
tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds))

def func(i, return_dict=False):
i = int(i)
res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns}
for preprocessor in preprocessor_list:
# TODO preprocessor output may have the same key
res.update({
k: np.array(v)
for k, v in preprocessor(self._hf_ds[i]).items()
})
if return_dict:
return res
return tuple(list(res.values()))

sample_res = func(0, True)

@tf.function(input_signature=[tf.TensorSpec(None, tf.int64)])
def fetch_function(i):
output = tf.numpy_function(
func,
inp=[i],
Tout=[
tf.dtypes.as_dtype(val.dtype)
for val in sample_res.values()
],
)
return {key: output[i] for i, key in enumerate(sample_res)}

tf_dataset = tf_dataset.map(
fetch_function, num_parallel_calls=tf.data.AUTOTUNE)
if label_cols:

def split_features_and_labels(input_batch):
labels = {
key: tensor
for key, tensor in input_batch.items() if key in label_cols
}
if len(input_batch) == 1:
input_batch = next(iter(input_batch.values()))
if len(labels) == 1:
labels = next(iter(labels.values()))
return input_batch, labels

tf_dataset = tf_dataset.map(split_features_and_labels)

elif len(columns) == 1:
tf_dataset = tf_dataset.map(lambda x: next(iter(x.values())))
if batch_size > 1:
tf_dataset = tf_dataset.batch(
batch_size, drop_remainder=drop_remainder)

if prefetch:
tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)
return tf_dataset

def to_tf_dataset(
self,
columns: Union[str, List[str]],
batch_size: int,
shuffle: bool,
collate_fn: Callable,
preprocessors: Union[Callable, List[Callable]] = None,
columns: Union[str, List[str]] = None,
collate_fn: Callable = None,
drop_remainder: bool = None,
collate_fn_args: Dict[str, Any] = None,
label_cols: Union[str, List[str]] = None,
dummy_labels: bool = False,
prefetch: bool = True,
):
"""Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like
model.fit() or model.predict().

Args:
batch_size (int): Number of samples in a single batch.
shuffle(bool): Shuffle the dataset order.
preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process
every sample of the dataset. The output type of processors is dict, and each field of the dict will be
used as a field of the tf.data.Dataset. If the `preprocessors` is None, the `collate_fn`
shouldn't be None.
columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None,
the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of
processors will also be added.
collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If
the `preprocessors` is None, the `collate_fn` shouldn't be None.
drop_remainder(bool, default None): Drop the last incomplete batch when loading.
collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the `collate_fn`.
label_cols (str or List[str], default None): Dataset column(s) to load as labels.
prefetch (bool, default True): Prefetch data.

Returns:
:class:`tf.data.Dataset`

"""
if not TF_AVAILABLE:
raise ImportError(
'The function to_tf_dataset requires Tensorflow to be installed.'
)
if preprocessors is not None:
return self.to_tf_dataset_with_processors(
batch_size,
shuffle,
preprocessors,
drop_remainder=drop_remainder,
prefetch=prefetch,
label_cols=label_cols,
columns=columns)

if collate_fn is None:
logger.error(
"The `preprocessors` and the `collate_fn` shouldn't both be None."
)
return None
self._hf_ds.reset_format()
return self._hf_ds.to_tf_dataset(
columns,
@@ -123,7 +389,6 @@ class PyDataset:
drop_remainder=drop_remainder,
collate_fn_args=collate_fn_args,
label_cols=label_cols,
dummy_labels=dummy_labels,
prefetch=prefetch)
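
A sketch of building a batched tf.data pipeline through this method (the dataset, preprocessor and column names are placeholders):

tf_ds = ds.to_tf_dataset(
    batch_size=16,
    shuffle=True,
    preprocessors=my_preprocessor,
    columns=['input_ids'],
    label_cols=['label'])
for features, labels in tf_ds.take(1):
    print(features, labels)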

def to_hf_dataset(self) -> Dataset:


+ 0
- 0
modelscope/pydatasets/utils/__init__.py View File


+ 66
- 0
modelscope/pydatasets/utils/ms_api.py View File

@@ -0,0 +1,66 @@
import os
from collections import defaultdict
from typing import Optional

import requests

from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH,
MS_HUB_ENDPOINT)
from modelscope.utils.logger import get_logger

logger = get_logger()


class MsApi:

def __init__(self, endpoint=MS_HUB_ENDPOINT):
self.endpoint = endpoint

def list_datasets(self):
path = f'{self.endpoint}/api/v1/datasets'
headers = None
params = {}
r = requests.get(path, params=params, headers=headers)
r.raise_for_status()
dataset_list = r.json()['Data']
return [x['Name'] for x in dataset_list]

def fetch_dataset_scripts(self,
dataset_name: str,
version: Optional[str] = 'master',
force_download=False):
datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}'
r = requests.get(datahub_url)
r.raise_for_status()
dataset_list = r.json()['Data']
if len(dataset_list) == 0:
return None
dataset_id = dataset_list[0]['Id']
version = version or 'master'
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}'
r = requests.get(datahub_url)
r.raise_for_status()
file_list = r.json()['Data']['Files']
cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name,
version)
os.makedirs(cache_dir, exist_ok=True)
local_paths = defaultdict(list)
for file_info in file_list:
file_path = file_info['Path']
if file_path.endswith('.py'):
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' \
f'Revision={version}&Path={file_path}'
r = requests.get(datahub_url)
r.raise_for_status()
content = r.json()['Data']['Content']
local_path = os.path.join(cache_dir, file_path)
if os.path.exists(local_path) and not force_download:
logger.warning(
f"Reusing dataset {dataset_name}'s python file ({local_path})"
)
local_paths['py'].append(local_path)
continue
with open(local_path, 'w') as f:
f.writelines(content)
local_paths['py'].append(local_path)
return local_paths
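
A usage sketch ('my_dataset' is a placeholder name):

api = MsApi()  # defaults to MS_HUB_ENDPOINT
names = api.list_datasets()
scripts = api.fetch_dataset_scripts('my_dataset', version='master')
# scripts is None when the hub does not know the dataset; otherwise
# scripts['py'] holds local paths of the cached .py loading scripts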

+ 1
- 1
modelscope/trainers/nlp/space/trainers/gen_trainer.py View File

@@ -13,7 +13,7 @@ import torch
from tqdm import tqdm
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

import modelscope.utils.nlp.space.ontology as ontology
from .....utils.nlp.space import ontology
from ..metrics.metrics_tracker import MetricsTracker




+ 1
- 3
modelscope/trainers/nlp/space/trainers/intent_trainer.py View File

@@ -14,9 +14,7 @@ import torch
from tqdm import tqdm
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from modelscope.trainers.nlp.space.metrics.metrics_tracker import \
MetricsTracker
from modelscope.utils.nlp.space.args import str2bool
from ..metrics.metrics_tracker import MetricsTracker


def get_logger(log_path, name='default'):


+ 5
- 2
modelscope/utils/constant.py View File

@@ -28,9 +28,13 @@ class Tasks(object):
image_editing = 'image-editing'
image_generation = 'image-generation'
image_matting = 'image-matting'
ocr_detection = 'ocr-detection'
action_recognition = 'action-recognition'

# nlp tasks
word_segmentation = 'word-segmentation'
nli = 'nli'
sentiment_classification = 'sentiment-classification'
sentiment_analysis = 'sentiment-analysis'
sentence_similarity = 'sentence-similarity'
text_classification = 'text-classification'
@@ -45,8 +49,7 @@ class Tasks(object):
dialog_state_tracking = 'dialog-state-tracking'
table_question_answering = 'table-question-answering'
feature_extraction = 'feature-extraction'
sentence_similarity = 'sentence-similarity'
fill_mask = 'fill-mask '
fill_mask = 'fill-mask'
summarization = 'summarization'
question_answering = 'question-answering'



+ 61
- 8
modelscope/utils/hub.py View File

@@ -1,14 +1,67 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import os.path as osp
from typing import List, Optional, Union

from maas_hub.constants import MODEL_ID_SEPARATOR
from requests import HTTPError

from modelscope.hub.file_download import model_file_download
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile

# temp solution before the hub-cache is in place
def get_model_cache_dir(model_id: str, branch: str = 'master'):
model_id_expanded = model_id.replace('/',
MODEL_ID_SEPARATOR) + '.' + branch
default_cache_dir = os.path.expanduser(os.path.join('~/.cache', 'maas'))
return os.getenv('MAAS_CACHE',
os.path.join(default_cache_dir, 'hub', model_id_expanded))

def create_model_if_not_exist(
api,
model_id: str,
chinese_name: str,
visibility: Optional[int] = 5, # 1-private, 5-public
license: Optional[str] = 'apache-2.0',
revision: Optional[str] = 'master'):
exists = True
try:
api.get_model(model_id=model_id, revision=revision)
except HTTPError:
exists = False
if exists:
print(f'model {model_id} already exists, skipping creation.')
return False
else:
api.create_model(
model_id=model_id,
chinese_name=chinese_name,
visibility=visibility,
license=license)
print(f'model {model_id} successfully created.')
return True
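
A sketch, assuming api is a hub client exposing the get_model/create_model calls used above (see modelscope/hub/api.py); the model id and name are placeholders:

created = create_model_if_not_exist(
    api, 'damo/my-model', chinese_name='my model')
# prints a notice and returns False when 'damo/my-model' already exists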


def read_config(model_id_or_path: str):
""" Read config from hub or local path

Args:
model_id_or_path (str): Model repo name or local directory path.

Return:
config (:obj:`Config`): config object
"""
if not os.path.exists(model_id_or_path):
local_path = model_file_download(model_id_or_path,
ModelFile.CONFIGURATION)
else:
local_path = os.path.join(model_id_or_path, ModelFile.CONFIGURATION)

return Config.from_file(local_path)


def auto_load(model: Union[str, List[str]]):
if isinstance(model, str):
if not osp.exists(model):
model = snapshot_download(model)
else:
model = [
snapshot_download(m) if not osp.exists(m) else m for m in model
]

return model
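
Both helpers accept either a hub model id or a local path; a sketch ('damo/some-model' is a placeholder id):

cfg = read_config('damo/some-model')       # downloads the configuration file
                                           # when no local path matches
model_dir = auto_load('damo/some-model')   # snapshot download unless the
                                           # path already exists locally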

+ 1
- 1
modelscope/utils/registry.py View File

@@ -78,7 +78,7 @@ class Registry(object):
f'{self._name}[{default_group}] and will '
'be overwritten')
logger.warning(f'{self._modules[default_group][module_name]}'
'to {module_cls}')
f'to {module_cls}')
# also register module in the default group for faster access
# only by module name
self._modules[default_group][module_name] = module_cls


+ 15
- 0
modelscope/utils/test_utils.py View File

@@ -2,6 +2,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import unittest

from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE

TEST_LEVEL = 2
TEST_LEVEL_STR = 'TEST_LEVEL'
@@ -15,6 +18,18 @@ def test_level():
return TEST_LEVEL


def require_tf(test_case):
if not TF_AVAILABLE:
test_case = unittest.skip('test requires TensorFlow')(test_case)
return test_case


def require_torch(test_case):
if not TORCH_AVAILABLE:
test_case = unittest.skip('test requires PyTorch')(test_case)
return test_case


def set_test_level(level: int):
global TEST_LEVEL
TEST_LEVEL = level
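
A sketch of the new skip helpers applied to framework-specific tests:

import unittest

from modelscope.utils.test_utils import require_tf, require_torch


class FrameworkSpecificTest(unittest.TestCase):

    @require_torch
    def test_torch_branch(self):
        self.assertTrue(True)  # placeholder body

    @require_tf
    def test_tf_branch(self):
        self.assertTrue(True)  # placeholder body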

+ 11
- 11
requirements/audio.txt View File

@@ -1,25 +1,25 @@
#tts
h5py==2.10.0
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl
https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl
#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl
h5py
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/pytorch_wavelets-1.3.0-py3-none-any.whl
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp36-cp36m-linux_x86_64.whl; python_version=='3.6'
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp37-cp37m-linux_x86_64.whl; python_version=='3.7'
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp38-cp38-linux_x86_64.whl; python_version=='3.8'
https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp39-cp39-linux_x86_64.whl; python_version=='3.9'
inflect
keras==2.2.4
keras
librosa
lxml
matplotlib
nara_wpe
numpy==1.18.*
protobuf==3.20.*
numpy
protobuf>3,<=3.20
ptflops
PyWavelets>=1.0.0
scikit-learn==0.23.2
scikit-learn
sox
tensorboard
tensorflow==1.15.*
torch==1.10.*
torch
torchaudio
torchvision
tqdm


+ 2
- 0
requirements/cv.txt View File

@@ -1 +1,3 @@
decord>=0.6.0
easydict
tf_slim

+ 3
- 2
requirements/nlp.txt View File

@@ -1,4 +1,5 @@
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
# https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
http://ait-public.oss-cn-hangzhou-zmf.aliyuncs.com/jizhu/en_core_web_sm-2.3.1.tar.gz
https://alinlp.alibaba-inc.com/pypi/sofa-1.0.3-py3-none-any.whl
spacy>=2.3.5
# python -m spacy download en_core_web_sm

+ 4
- 1
requirements/runtime.txt View File

@@ -1,13 +1,16 @@
addict
datasets
easydict
https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl
filelock>=3.3.0
numpy
opencv-python-headless
Pillow>=6.2.0
pyyaml
requests
requests==2.27.1
scipy
setuptools==58.0.4
tokenizers<=0.10.3
tqdm>=4.64.0
transformers<=4.16.2
yapf
