diff --git a/data/test/images/image_captioning.png b/data/test/images/image_captioning.png new file mode 100644 index 00000000..de3f1918 --- /dev/null +++ b/data/test/images/image_captioning.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af83a94899a6d23339c3ecc5c4c58c57c835af57b531a2f4c50461184f820141 +size 603621 diff --git a/data/test/images/ocr_detection.jpg b/data/test/images/ocr_detection.jpg new file mode 100644 index 00000000..c347810e --- /dev/null +++ b/data/test/images/ocr_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c8435db5583400be5d11a2c17910c96133b462c8a99ccaf0e19f4aac34e0a94 +size 141149 diff --git a/data/test/videos/action_recognition_test_video.mp4 b/data/test/videos/action_recognition_test_video.mp4 new file mode 100644 index 00000000..9197b770 --- /dev/null +++ b/data/test/videos/action_recognition_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24dc4237b1197321ee8486bb983fa01fd47e2b4afdb3c2df24229e5f2bd20119 +size 1475924 diff --git a/modelscope/pipelines/nlp/space/__init__.py b/modelscope/hub/__init__.py similarity index 100% rename from modelscope/pipelines/nlp/space/__init__.py rename to modelscope/hub/__init__.py diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py new file mode 100644 index 00000000..104eafbd --- /dev/null +++ b/modelscope/hub/api.py @@ -0,0 +1,264 @@ +import os +import pickle +import subprocess +from http.cookiejar import CookieJar +from os.path import expanduser +from typing import List, Optional, Tuple, Union + +import requests + +from modelscope.utils.logger import get_logger +from .constants import LOGGER_NAME +from .errors import NotExistError, is_ok, raise_on_error +from .utils.utils import get_endpoint, model_id_to_group_owner_name + +logger = get_logger() + + +class HubApi: + + def __init__(self, endpoint=None): + self.endpoint = endpoint if endpoint is not None else get_endpoint() + + def login( + self, + user_name: str, + password: str, + ) -> Tuple[str, CookieJar]: + """ + Login with username and password + + Args: + user_name(`str`): user name on modelscope + password(`str`): password + + Returns: + gitlab token: to access private repos + cookies: to authenticate yourself to ModelScope open-api + + + You only have to login once within 30 days.
+ + + TODO: handle cookies expire + + """ + path = f'{self.endpoint}/api/v1/login' + r = requests.post( + path, json={ + 'username': user_name, + 'password': password + }) + r.raise_for_status() + d = r.json() + raise_on_error(d) + + token = d['Data']['AccessToken'] + cookies = r.cookies + + # save token and cookie + ModelScopeConfig.save_token(token) + ModelScopeConfig.save_cookies(cookies) + ModelScopeConfig.write_to_git_credential(user_name, password) + + return d['Data']['AccessToken'], cookies + + def create_model(self, model_id: str, chinese_name: str, visibility: int, + license: str) -> str: + """ + Create model repo at ModelScopeHub + + Args: + model_id:(`str`): The model id + chinese_name(`str`): chinese name of the model + visibility(`int`): visibility of the model(1-private, 3-internal, 5-public) + license(`str`): license of the model, candidates can be found at: TBA + + Returns: + name of the model created + + + model_id = {owner}/{name} + + """ + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise ValueError('Token does not exist, please login first.') + + path = f'{self.endpoint}/api/v1/models' + owner_or_group, name = model_id_to_group_owner_name(model_id) + r = requests.post( + path, + json={ + 'Path': owner_or_group, + 'Name': name, + 'ChineseName': chinese_name, + 'Visibility': visibility, + 'License': license + }, + cookies=cookies) + r.raise_for_status() + raise_on_error(r.json()) + d = r.json() + return d['Data']['Name'] + + def delete_model(self, model_id): + """_summary_ + + Args: + model_id (str): The model id. + + model_id = {owner}/{name} + + """ + cookies = ModelScopeConfig.get_cookies() + path = f'{self.endpoint}/api/v1/models/{model_id}' + + r = requests.delete(path, cookies=cookies) + r.raise_for_status() + raise_on_error(r.json()) + + def get_model_url(self, model_id): + return f'{self.endpoint}/api/v1/models/{model_id}.git' + + def get_model( + self, + model_id: str, + revision: str = 'master', + ) -> str: + """ + Get model information at modelscope_hub + + Args: + model_id(`str`): The model id. + revision(`str`): revision of model + Returns: + The model details information. 
+ Raises: + NotExistError: If the model is not exist, will throw NotExistError + + model_id = {owner}/{name} + + """ + cookies = ModelScopeConfig.get_cookies() + owner_or_group, name = model_id_to_group_owner_name(model_id) + path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}?{revision}' + + r = requests.get(path, cookies=cookies) + if r.status_code == 200: + if is_ok(r.json()): + return r.json()['Data'] + else: + raise NotExistError(r.json()['Message']) + else: + r.raise_for_status() + + def get_model_branches_and_tags( + self, + model_id: str, + ) -> Tuple[List[str], List[str]]: + cookies = ModelScopeConfig.get_cookies() + + path = f'{self.endpoint}/api/v1/models/{model_id}/revisions' + r = requests.get(path, cookies=cookies) + r.raise_for_status() + d = r.json() + raise_on_error(d) + info = d['Data'] + branches = [x['Revision'] for x in info['RevisionMap']['Branches'] + ] if info['RevisionMap']['Branches'] else [] + tags = [x['Revision'] for x in info['RevisionMap']['Tags'] + ] if info['RevisionMap']['Tags'] else [] + return branches, tags + + def get_model_files( + self, + model_id: str, + revision: Optional[str] = 'master', + root: Optional[str] = None, + recursive: Optional[str] = False, + use_cookies: Union[bool, CookieJar] = False) -> List[dict]: + + cookies = None + if isinstance(use_cookies, CookieJar): + cookies = use_cookies + elif use_cookies: + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise ValueError('Token does not exist, please login first.') + + path = f'{self.endpoint}/api/v1/models/{model_id}/repo/files?Revision={revision}&Recursive={recursive}' + if root is not None: + path = path + f'&Root={root}' + + r = requests.get(path, cookies=cookies) + + r.raise_for_status() + d = r.json() + raise_on_error(d) + + files = [] + for file in d['Data']['Files']: + if file['Name'] == '.gitignore' or file['Name'] == '.gitattributes': + continue + + files.append(file) + return files + + +class ModelScopeConfig: + path_credential = expanduser('~/.modelscope/credentials') + os.makedirs(path_credential, exist_ok=True) + + @classmethod + def save_cookies(cls, cookies: CookieJar): + with open(os.path.join(cls.path_credential, 'cookies'), 'wb+') as f: + pickle.dump(cookies, f) + + @classmethod + def get_cookies(cls): + try: + with open(os.path.join(cls.path_credential, 'cookies'), 'rb') as f: + return pickle.load(f) + except FileNotFoundError: + logger.warn("Auth token does not exist, you'll get authentication \ + error when downloading private model files. Please login first" + ) + + @classmethod + def save_token(cls, token: str): + with open(os.path.join(cls.path_credential, 'token'), 'w+') as f: + f.write(token) + + @classmethod + def get_token(cls) -> Optional[str]: + """ + Get token or None if not existent. + + Returns: + `str` or `None`: The token, `None` if it doesn't exist. 
+ + """ + token = None + try: + with open(os.path.join(cls.path_credential, 'token'), 'r') as f: + token = f.read() + except FileNotFoundError: + pass + return token + + @staticmethod + def write_to_git_credential(username: str, password: str): + with subprocess.Popen( + 'git credential-store store'.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) as process: + input_username = f'username={username.lower()}' + input_password = f'password={password}' + + process.stdin.write( + f'url={get_endpoint()}\n{input_username}\n{input_password}\n\n' + .encode('utf-8')) + process.stdin.flush() diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py new file mode 100644 index 00000000..a38f9afb --- /dev/null +++ b/modelscope/hub/constants.py @@ -0,0 +1,8 @@ +MODELSCOPE_URL_SCHEME = 'http://' +DEFAULT_MODELSCOPE_DOMAIN = '101.201.119.157:32330' +DEFAULT_MODELSCOPE_GITLAB_DOMAIN = '101.201.119.157:31102' + +DEFAULT_MODELSCOPE_GROUP = 'damo' +MODEL_ID_SEPARATOR = '/' + +LOGGER_NAME = 'ModelScopeHub' diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py new file mode 100644 index 00000000..13ea709f --- /dev/null +++ b/modelscope/hub/errors.py @@ -0,0 +1,30 @@ +class NotExistError(Exception): + pass + + +class RequestError(Exception): + pass + + +def is_ok(rsp): + """ Check the request is ok + + Args: + rsp (_type_): The request response body + Failed: {'Code': 10010101004, 'Message': 'get model info failed, err: unauthorized permission', + 'RequestId': '', 'Success': False} + Success: {'Code': 200, 'Data': {}, 'Message': 'success', 'RequestId': '', 'Success': True} + """ + return rsp['Code'] == 200 and rsp['Success'] + + +def raise_on_error(rsp): + """If response error, raise exception + + Args: + rsp (_type_): The server response + """ + if rsp['Code'] == 200 and rsp['Success']: + return True + else: + raise RequestError(rsp['Message']) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py new file mode 100644 index 00000000..e5c64f1c --- /dev/null +++ b/modelscope/hub/file_download.py @@ -0,0 +1,254 @@ +import copy +import fnmatch +import logging +import os +import sys +import tempfile +import time +from functools import partial +from hashlib import sha256 +from pathlib import Path +from typing import BinaryIO, Dict, Optional, Union +from uuid import uuid4 + +import json +import requests +from filelock import FileLock +from requests.exceptions import HTTPError +from tqdm import tqdm + +from modelscope import __version__ +from modelscope.utils.logger import get_logger +from .api import HubApi, ModelScopeConfig +from .constants import (DEFAULT_MODELSCOPE_GROUP, LOGGER_NAME, + MODEL_ID_SEPARATOR) +from .errors import NotExistError, RequestError, raise_on_error +from .utils.caching import ModelFileSystemCache +from .utils.utils import (get_cache_dir, get_endpoint, + model_id_to_group_owner_name) + +SESSION_ID = uuid4().hex +logger = get_logger() + + +def model_file_download( + model_id: str, + file_path: str, + revision: Optional[str] = 'master', + cache_dir: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, + local_files_only: Optional[bool] = False, +) -> Optional[str]: # pragma: no cover + """ + Download from a given URL and cache it if it's not already present in the + local cache. + + Given a URL, this function looks for the corresponding file in the local + cache. If it's not there, download it. Then return the path to the cached + file. 
+ + Args: + model_id (`str`): + The model to whom the file to be downloaded belongs. + file_path(`str`): + Path of the file to be downloaded, relative to the root of model repo + revision(`str`, *optional*): + revision of the model file to be downloaded. + Can be any of a branch, tag or commit hash, default to `master` + cache_dir (`str`, `Path`, *optional*): + Path to the folder where cached files are stored. + user_agent (`dict`, `str`, *optional*): + The user-agent info in the form of a dictionary or a string. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + if `False`, download the file anyway even it exists + + Returns: + Local path (string) of file or if networking is off, last version of + file cached on disk. + + + + Raises the following errors: + + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) + if ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + + + """ + if cache_dir is None: + cache_dir = get_cache_dir() + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + group_or_owner, name = model_id_to_group_owner_name(model_id) + + cache = ModelFileSystemCache(cache_dir, group_or_owner, name) + + # if local_files_only is `True` and the file already exists in cached_path + # return the cached path + if local_files_only: + cached_file_path = cache.get_file_by_path(file_path) + if cached_file_path is not None: + logger.warning( + "File exists in local cache, but we're not sure it's up to date" + ) + return cached_file_path + else: + raise ValueError( + 'Cannot find the requested files in the cached path and outgoing' + ' traffic has been disabled. To enable model look-ups and downloads' + " online, set 'local_files_only' to False.") + + _api = HubApi() + headers = {'user-agent': http_user_agent(user_agent=user_agent, )} + branches, tags = _api.get_model_branches_and_tags(model_id) + file_to_download_info = None + is_commit_id = False + if revision in branches or revision in tags: # The revision is version or tag, + # we need to confirm the version is up to date + # we need to get the file list to check if the lateast version is cached, if so return, otherwise download + model_files = _api.get_model_files( + model_id=model_id, + revision=revision, + recursive=True, + ) + + for model_file in model_files: + if model_file['Type'] == 'tree': + continue + + if model_file['Path'] == file_path: + model_file['Branch'] = revision + if cache.exists(model_file): + return cache.get_file_by_info(model_file) + else: + file_to_download_info = model_file + + if file_to_download_info is None: + raise NotExistError('The file path: %s not exist in: %s' % + (file_path, model_id)) + else: # the revision is commit id. + cached_file_path = cache.get_file_by_path_and_commit_id( + file_path, revision) + if cached_file_path is not None: + logger.info('The specified file is in cache, skip downloading!') + return cached_file_path # the file is in cache. 
+ is_commit_id = True + # we need to download again + # TODO: skip using JWT for authorization, use cookie instead + cookies = ModelScopeConfig.get_cookies() + url_to_download = get_file_download_url(model_id, file_path, revision) + file_to_download_info = { + 'Path': file_path, + 'Revision': + revision if is_commit_id else file_to_download_info['Revision'] + } + # Prevent parallel downloads of the same file with a lock. + lock_path = cache.get_root_location() + '.lock' + + with FileLock(lock_path): + temp_file_name = next(tempfile._get_candidate_names()) + http_get_file( + url_to_download, + cache_dir, + temp_file_name, + headers=headers, + cookies=None if cookies is None else cookies.get_dict()) + return cache.put_file(file_to_download_info, + os.path.join(cache_dir, temp_file_name)) + + +def http_user_agent(user_agent: Union[Dict, str, None] = None, ) -> str: + """Formats a user-agent string with basic info about a request. + + Args: + user_agent (`str`, `dict`, *optional*): + The user agent info in the form of a dictionary or a single string. + + Returns: + The formatted user-agent string. + """ + ua = f'modelscope/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}' + + if isinstance(user_agent, dict): + ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua = user_agent + return ua + + +def get_file_download_url(model_id: str, file_path: str, revision: str): + """ + Format file download url according to `model_id`, `revision` and `file_path`. + e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`, + the resulted download url is: https://maas.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md + """ + download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}' + return download_url_template.format( + endpoint=get_endpoint(), + model_id=model_id, + revision=revision, + file_path=file_path, + ) + + +def http_get_file( + url: str, + local_dir: str, + file_name: str, + cookies: Dict[str, str], + headers: Optional[Dict[str, str]] = None, +): + """ + Download remote file. Do not gobble up errors. 
This method is only used by snapshot_download, since the behavior is quite different from single file download + TODO: consolidate with http_get_file() to avoid duplicate code + + Args: + url(`str`): + actual download url of the file + local_dir(`str`): + local directory where the downloaded file is stored + file_name(`str`): + name of the file stored in `local_dir` + cookies(`Dict[str, str]`): + cookies used to authenticate the user, which is used for downloading private repos + headers(`Optional[Dict[str, str]] = None`): + http headers to carry necessary info when requesting the remote file + + """ + temp_file_manager = partial( + tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False) + + with temp_file_manager() as temp_file: + logger.info('downloading %s to %s', url, temp_file.name) + headers = copy.deepcopy(headers) + + r = requests.get(url, stream=True, headers=headers, cookies=cookies) + r.raise_for_status() + + content_length = r.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + + progress = tqdm( + unit='B', + unit_scale=True, + unit_divisor=1024, + total=total, + initial=0, + desc='Downloading', + ) + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + logger.info('storing %s in cache at %s', url, local_dir) + os.replace(temp_file.name, os.path.join(local_dir, file_name)) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py new file mode 100644 index 00000000..5f079105 --- /dev/null +++ b/modelscope/hub/git.py @@ -0,0 +1,80 @@ +from typing import Union + +from modelscope.utils.logger import get_logger +from .constants import LOGGER_NAME +from .utils._subprocess import run_subprocess + +logger = get_logger() + + +def git_clone( + local_dir: str, + repo_url: str, +): + # TODO: use "git clone" or "git lfs clone" according to git version + # TODO: print stderr when subprocess fails + run_subprocess( + f'git clone {repo_url}'.split(), + local_dir, + True, + ) + + +def git_checkout( + local_dir: str, + revision: str, +): + run_subprocess(f'git checkout {revision}'.split(), local_dir) + + +def git_add(local_dir: str, ): + run_subprocess( + 'git add .'.split(), + local_dir, + True, + ) + + +def git_commit(local_dir: str, commit_message: str): + run_subprocess( + 'git commit -v -m'.split() + [commit_message], + local_dir, + True, + ) + + +def git_push(local_dir: str, branch: str): + # check current branch + cur_branch = git_current_branch(local_dir) + if cur_branch != branch: + logger.error( + "You're trying to push to a different branch, please double check") + return + + run_subprocess( + f'git push origin {branch}'.split(), + local_dir, + True, + ) + + +def git_current_branch(local_dir: str) -> Union[str, None]: + """ + Get current branch name + + Args: + local_dir(`str`): local model repo directory + + Returns: + branch name you're currently on + """ + try: + process = run_subprocess( + 'git rev-parse --abbrev-ref HEAD'.split(), + local_dir, + True, + ) + + return str(process.stdout).strip() + except Exception as e: + raise e diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py new file mode 100644 index 00000000..6367f903 --- /dev/null +++ b/modelscope/hub/repository.py @@ -0,0 +1,173 @@ +import os +import subprocess +from pathlib import Path +from typing import Optional, Union + +from modelscope.utils.logger import
get_logger +from .api import ModelScopeConfig +from .constants import MODELSCOPE_URL_SCHEME +from .git import git_add, git_checkout, git_clone, git_commit, git_push +from .utils._subprocess import run_subprocess +from .utils.utils import get_gitlab_domain + +logger = get_logger() + + +class Repository: + + def __init__( + self, + local_dir: str, + clone_from: Optional[str] = None, + auth_token: Optional[str] = None, + private: Optional[bool] = False, + revision: Optional[str] = 'master', + ): + """ + Instantiate a Repository object by cloning the remote ModelScopeHub repo + Args: + local_dir(`str`): + local directory to store the model files + clone_from(`Optional[str] = None`): + model id in ModelScope-hub from which to git clone + You should ignore this parameter when `local_dir` is already a git repo + auth_token(`Optional[str]`): + token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter + as the token is already saved when you log in the first time + private(`Optional[bool]`): + whether the model is private, default to False + revision(`Optional[str]`): + revision of the model you want to clone from. Can be any of a branch, tag or commit hash + """ + logger.info('Instantiating Repository object...') + + # Create local directory if not exist + os.makedirs(local_dir, exist_ok=True) + self.local_dir = os.path.join(os.getcwd(), local_dir) + + self.private = private + + # Check git and git-lfs installation + self.check_git_versions() + + # Retrieve auth token + if not private and isinstance(auth_token, str): + logger.warning( + 'cloning a public repo with a token, which will be ignored') + self.token = None + else: + if isinstance(auth_token, str): + self.token = auth_token + else: + self.token = ModelScopeConfig.get_token() + + if self.token is None: + raise EnvironmentError( + 'Token does not exist, the clone will fail for private repo.' + ' Please login first.') + + # git clone + if clone_from is not None: + self.model_id = clone_from + logger.info('cloning model repo to %s ...', self.local_dir) + git_clone(self.local_dir, self.get_repo_url()) + else: + if is_git_repo(self.local_dir): + logger.debug('[Repository] is a valid git repo') + else: + raise ValueError( + 'If not specifying `clone_from`, you need to pass Repository a' + ' valid git clone.') + + # git checkout + if isinstance(revision, str) and revision != 'master': + git_checkout(self.local_dir, revision) + + def push_to_hub(self, + commit_message: str, + revision: Optional[str] = 'master'): + """ + Push changes to the hub + + Args: + commit_message(`str`): + commit message describing the changes, it's mandatory + revision(`Optional[str]`): + remote branch you want to push to, default to `master` + + + The function complains when local and remote branch are different, please be careful + + + """ + git_add(self.local_dir) + git_commit(self.local_dir, commit_message) + + logger.info('Pushing changes to repo...') + git_push(self.local_dir, revision) + + # TODO: if git push fails, how to retry? + + def check_git_versions(self): + """ + Checks that `git` and `git-lfs` can be run. + + Raises: + `EnvironmentError`: if `git` or `git-lfs` are not installed.
+ """ + try: + git_version = run_subprocess('git --version'.split(), + self.local_dir).stdout.strip() + except FileNotFoundError: + raise EnvironmentError( + 'Looks like you do not have git installed, please install.') + + try: + lfs_version = run_subprocess('git-lfs --version'.split(), + self.local_dir).stdout.strip() + except FileNotFoundError: + raise EnvironmentError( + 'Looks like you do not have git-lfs installed, please install.' + ' You can install from https://git-lfs.github.com/.' + ' Then run `git lfs install` (you only have to do this once).') + logger.info(git_version + '\n' + lfs_version) + + def get_repo_url(self) -> str: + """ + Get repo url to clone, according whether the repo is private or not + """ + url = None + + if self.private: + url = f'{MODELSCOPE_URL_SCHEME}oauth2:{self.token}@{get_gitlab_domain()}/{self.model_id}' + else: + url = f'{MODELSCOPE_URL_SCHEME}{get_gitlab_domain()}/{self.model_id}' + + if not url: + raise ValueError( + 'Empty repo url, please check clone_from parameter') + + logger.debug('url to clone: %s', str(url)) + + return url + + +def is_git_repo(folder: Union[str, Path]) -> bool: + """ + Check if the folder is the root or part of a git repository + + Args: + folder (`str`): + The folder in which to run the command. + + Returns: + `bool`: `True` if the repository is part of a repository, `False` + otherwise. + """ + folder_exists = os.path.exists(os.path.join(folder, '.git')) + git_branch = subprocess.run( + 'git branch'.split(), + cwd=folder, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + return folder_exists and git_branch.returncode == 0 diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py new file mode 100644 index 00000000..90d850f4 --- /dev/null +++ b/modelscope/hub/snapshot_download.py @@ -0,0 +1,125 @@ +import os +import tempfile +from glob import glob +from pathlib import Path +from typing import Dict, Optional, Union + +from modelscope.utils.logger import get_logger +from .api import HubApi, ModelScopeConfig +from .constants import DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR +from .errors import NotExistError, RequestError, raise_on_error +from .file_download import (get_file_download_url, http_get_file, + http_user_agent) +from .utils.caching import ModelFileSystemCache +from .utils.utils import get_cache_dir, model_id_to_group_owner_name + +logger = get_logger() + + +def snapshot_download(model_id: str, + revision: Optional[str] = 'master', + cache_dir: Union[str, Path, None] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + private: Optional[bool] = False) -> str: + """Download all files of a repo. + Downloads a whole snapshot of a repo's files at the specified revision. This + is useful when you want all files from a repo, because you don't know which + ones you will need a priori. All files are nested inside a folder in order + to keep their actual filename relative to that folder. + + An alternative would be to just clone a repo but this would require that the + user always has git and git-lfs installed, and properly configured. + Args: + model_id (`str`): + A user or an organization name and a repo name separated by a `/`. + revision (`str`, *optional*): + An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (`str`, `Path`, *optional*): + Path to the folder where cached files are stored. 
+ user_agent (`str`, `dict`, *optional*): + The user-agent info in the form of a dictionary or a string. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + Returns: + Local folder path (string) of repo snapshot + + + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + + """ + + if cache_dir is None: + cache_dir = get_cache_dir() + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + group_or_owner, name = model_id_to_group_owner_name(model_id) + + cache = ModelFileSystemCache(cache_dir, group_or_owner, name) + if local_files_only: + if len(cache.cached_files) == 0: + raise ValueError( + 'Cannot find the requested files in the cached path and outgoing' + ' traffic has been disabled. To enable model look-ups and downloads' + " online, set 'local_files_only' to False.") + logger.warn('We can not confirm the cached file is for revision: %s' + % revision) + return cache.get_root_location( + ) # we can not confirm the cached file is for snapshot 'revision' + else: + # make headers + headers = {'user-agent': http_user_agent(user_agent=user_agent, )} + _api = HubApi() + # get file list from model repo + branches, tags = _api.get_model_branches_and_tags(model_id) + if revision not in branches and revision not in tags: + raise NotExistError('The specified branch or tag : %s not exist!' + % revision) + + model_files = _api.get_model_files( + model_id=model_id, + revision=revision, + recursive=True, + use_cookies=private) + + cookies = None + if private: + cookies = ModelScopeConfig.get_cookies() + + for model_file in model_files: + if model_file['Type'] == 'tree': + continue + # check model_file is exist in cache, if exist, skip download, otherwise download + if cache.exists(model_file): + logger.info( + 'The specified file is in cache, skip downloading!') + continue + + # get download url + url = get_file_download_url( + model_id=model_id, + file_path=model_file['Path'], + revision=revision) + + # First download to /tmp + http_get_file( + url=url, + local_dir=tempfile.gettempdir(), + file_name=model_file['Name'], + headers=headers, + cookies=None if cookies is None else cookies.get_dict()) + # put file to cache + cache.put_file( + model_file, + os.path.join(tempfile.gettempdir(), model_file['Name'])) + + return os.path.join(cache.get_root_location()) diff --git a/tests/pipelines/nlp/__init__.py b/modelscope/hub/utils/__init__.py similarity index 100% rename from tests/pipelines/nlp/__init__.py rename to modelscope/hub/utils/__init__.py diff --git a/modelscope/hub/utils/_subprocess.py b/modelscope/hub/utils/_subprocess.py new file mode 100644 index 00000000..77e9fc48 --- /dev/null +++ b/modelscope/hub/utils/_subprocess.py @@ -0,0 +1,40 @@ +import subprocess +from typing import List + + +def run_subprocess(command: List[str], + folder: str, + check=True, + **kwargs) -> subprocess.CompletedProcess: + """ + Method to run subprocesses. Calling this will capture the `stderr` and `stdout`, + please call `subprocess.run` manually in case you would like for them not to + be captured. 
+ + Args: + command (`List[str]`): + The command to execute as a list of strings. + folder (`str`): + The folder in which to run the command. + check (`bool`, *optional*, defaults to `True`): + Setting `check` to `True` will raise a `subprocess.CalledProcessError` + when the subprocess has a non-zero exit code. + kwargs (`Dict[str]`): + Keyword arguments to be passed to the `subprocess.run` underlying command. + + Returns: + `subprocess.CompletedProcess`: The completed process. + """ + if isinstance(command, str): + raise ValueError( + '`run_subprocess` should be called with a list of strings.') + + return subprocess.run( + command, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + check=check, + encoding='utf-8', + cwd=folder, + **kwargs, + ) diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py new file mode 100644 index 00000000..ac258385 --- /dev/null +++ b/modelscope/hub/utils/caching.py @@ -0,0 +1,294 @@ +import hashlib +import logging +import os +import pickle +import tempfile +import time +from shutil import move, rmtree + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class FileSystemCache(object): + KEY_FILE_NAME = '.msc' + """Local file cache. + """ + + def __init__( + self, + cache_root_location: str, + **kwargs, + ): + """ + Parameters + ---------- + cache_location: str + The root location to store files. + """ + os.makedirs(cache_root_location, exist_ok=True) + self.cache_root_location = cache_root_location + self.load_cache() + + def get_root_location(self): + return self.cache_root_location + + def load_cache(self): + """Read set of stored blocks from file + Args: + owner(`str`): individual or group username at modelscope, can be empty for official models + name(`str`): name of the model + Returns: + The model details information. + Raises: + NotExistError: If the model is not exist, will throw NotExistError + TODO: Error based error code. + + model_id = {owner}/{name} + + """ + self.cached_files = [] + cache_keys_file_path = os.path.join(self.cache_root_location, + FileSystemCache.KEY_FILE_NAME) + if os.path.exists(cache_keys_file_path): + with open(cache_keys_file_path, 'rb') as f: + self.cached_files = pickle.load(f) + + def save_cached_files(self): + """Save cache metadata.""" + # save new meta to tmp and move to KEY_FILE_NAME + cache_keys_file_path = os.path.join(self.cache_root_location, + FileSystemCache.KEY_FILE_NAME) + # TODO: Sync file write + fd, fn = tempfile.mkstemp() + with open(fd, 'wb') as f: + pickle.dump(self.cached_files, f) + move(fn, cache_keys_file_path) + + def get_file(self, key): + """Check the key is in the cache, if exist, return the file, otherwise return None. + Args: + key(`str`): The cache key. + Returns: + If file exist, return the cached file location, otherwise None. + Raises: + None + + model_id = {owner}/{name} + + """ + pass + + def put_file(self, key, location): + """Put file to the cache, + Args: + key(`str`): The cache key + location(`str`): Location of the file, we will move the file to cache. + Returns: + The cached file path of the file. + Raises: + None + + model_id = {owner}/{name} + + """ + pass + + def remove_key(self, key): + """Remove cache key in index, The file is removed manually + + Args: + key (dict): The cache key. 
+ """ + self.cached_files.remove(key) + self.save_cached_files() + + def exists(self, key): + for cache_file in self.cached_files: + if cache_file == key: + return True + + return False + + def clear_cache(self): + """Remove all files and metadat from the cache + + In the case of multiple cache locations, this clears only the last one, + which is assumed to be the read/write one. + """ + rmtree(self.cache_root_location) + self.load_cache() + + def hash_name(self, key): + return hashlib.sha256(key.encode()).hexdigest() + + +class ModelFileSystemCache(FileSystemCache): + """Local cache file layout + cache_root/owner/model_name/|individual cached files + |.mk: file, The cache index file + Save only one version for each file. + """ + + def __init__(self, cache_root, owner, name): + """Put file to the cache + Args: + cache_root(`str`): The modelscope local cache root(default: ~/.modelscope/cache/models/) + owner(`str`): The model owner. + name('str'): The name of the model + branch('str'): The branch of model + tag('str'): The tag of model + Returns: + Raises: + None + + model_id = {owner}/{name} + + """ + super().__init__(os.path.join(cache_root, owner, name)) + + def get_file_by_path(self, file_path): + """Retrieve the cache if there is file match the path. + Args: + file_path (str): The file path in the model. + Returns: + path: the full path of the file. + """ + for cached_file in self.cached_files: + if file_path == cached_file['Path']: + cached_file_path = os.path.join(self.cache_root_location, + cached_file['Path']) + if os.path.exists(cached_file_path): + return cached_file_path + else: + self.remove_key(cached_file) + + return None + + def get_file_by_path_and_commit_id(self, file_path, commit_id): + """Retrieve the cache if there is file match the path. + Args: + file_path (str): The file path in the model. + commit_id (str): The commit id of the file + Returns: + path: the full path of the file. + """ + for cached_file in self.cached_files: + if file_path == cached_file['Path'] and \ + (cached_file['Revision'].startswith(commit_id) or commit_id.startswith(cached_file['Revision'])): + cached_file_path = os.path.join(self.cache_root_location, + cached_file['Path']) + if os.path.exists(cached_file_path): + return cached_file_path + else: + self.remove_key(cached_file) + + return None + + def get_file_by_info(self, model_file_info): + """Check if exist cache file. + + Args: + model_file_info (ModelFileInfo): The file information of the file. + + Returns: + _type_: _description_ + """ + cache_key = self.__get_cache_key(model_file_info) + for cached_file in self.cached_files: + if cached_file == cache_key: + orig_path = os.path.join(self.cache_root_location, + cached_file['Path']) + if os.path.exists(orig_path): + return orig_path + else: + self.remove_key(cached_file) + + return None + + def __get_cache_key(self, model_file_info): + cache_key = { + 'Path': model_file_info['Path'], + 'Revision': model_file_info['Revision'], # commit id + } + return cache_key + + def exists(self, model_file_info): + """Check the file is cached or not. 
+ + Args: + model_file_info (CachedFileInfo): The cached file info + + Returns: + bool: If exists return True otherwise False + """ + key = self.__get_cache_key(model_file_info) + is_exists = False + for cached_key in self.cached_files: + if cached_key['Path'] == key['Path'] and ( + cached_key['Revision'].startswith(key['Revision']) + or key['Revision'].startswith(cached_key['Revision'])): + is_exists = True + file_path = os.path.join(self.cache_root_location, + model_file_info['Path']) + if is_exists: + if os.path.exists(file_path): + return True + else: + self.remove_key( + model_file_info) # sameone may manual delete the file + return False + + def remove_if_exists(self, model_file_info): + """We in cache, remove it. + + Args: + model_file_info (ModelFileInfo): The model file information from server. + """ + for cached_file in self.cached_files: + if cached_file['Path'] == model_file_info['Path']: + self.remove_key(cached_file) + file_path = os.path.join(self.cache_root_location, + cached_file['Path']) + if os.path.exists(file_path): + os.remove(file_path) + + def put_file(self, model_file_info, model_file_location): + """Put model on model_file_location to cache, the model first download to /tmp, and move to cache. + + Args: + model_file_info (str): The file description returned by get_model_files + sample: + { + "CommitMessage": "add model\n", + "CommittedDate": 1654857567, + "CommitterName": "mulin.lyh", + "IsLFS": false, + "Mode": "100644", + "Name": "resnet18.pth", + "Path": "resnet18.pth", + "Revision": "09b68012b27de0048ba74003690a890af7aff192", + "Size": 46827520, + "Type": "blob" + } + model_file_location (str): The location of the temporary file. + Raises: + NotImplementedError: _description_ + + Returns: + str: The location of the cached file. + """ + self.remove_if_exists(model_file_info) # backup old revision + cache_key = self.__get_cache_key(model_file_info) + cache_full_path = os.path.join( + self.cache_root_location, + cache_key['Path']) # Branch and Tag do not have same name. 
+ cache_file_dir = os.path.dirname(cache_full_path) + if not os.path.exists(cache_file_dir): + os.makedirs(cache_file_dir, exist_ok=True) + # We can't make operation transaction + move(model_file_location, cache_full_path) + self.cached_files.append(cache_key) + self.save_cached_files() + return cache_full_path diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py new file mode 100644 index 00000000..d0704de8 --- /dev/null +++ b/modelscope/hub/utils/utils.py @@ -0,0 +1,39 @@ +import os + +from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, + DEFAULT_MODELSCOPE_GITLAB_DOMAIN, + DEFAULT_MODELSCOPE_GROUP, + MODEL_ID_SEPARATOR, + MODELSCOPE_URL_SCHEME) + + +def model_id_to_group_owner_name(model_id): + if MODEL_ID_SEPARATOR in model_id: + group_or_owner = model_id.split(MODEL_ID_SEPARATOR)[0] + name = model_id.split(MODEL_ID_SEPARATOR)[1] + else: + group_or_owner = DEFAULT_MODELSCOPE_GROUP + name = model_id + return group_or_owner, name + + +def get_cache_dir(): + """ + cache dir precedence: + function parameter > enviroment > ~/.cache/modelscope/hub + """ + default_cache_dir = os.path.expanduser( + os.path.join('~/.cache', 'modelscope')) + return os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir, + 'hub')) + + +def get_endpoint(): + modelscope_domain = os.getenv('MODELSCOPE_DOMAIN', + DEFAULT_MODELSCOPE_DOMAIN) + return MODELSCOPE_URL_SCHEME + modelscope_domain + + +def get_gitlab_domain(): + return os.getenv('MODELSCOPE_GITLAB_DOMAIN', + DEFAULT_MODELSCOPE_GITLAB_DOMAIN) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py new file mode 100644 index 00000000..f89b7b27 --- /dev/null +++ b/modelscope/metainfo.py @@ -0,0 +1,104 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + +class Models(object): + """ Names for different models. + + Holds the standard model name to use for identifying different model. + This should be used to register models. + + Model name should only contain model info but not task info. + """ + # vision models + + # nlp models + bert = 'bert' + palm = 'palm-v2' + structbert = 'structbert' + veco = 'veco' + + # audio models + sambert_hifi_16k = 'sambert-hifi-16k' + generic_tts_frontend = 'generic-tts-frontend' + hifigan16k = 'hifigan16k' + + # multi-modal models + ofa = 'ofa' + + +class Pipelines(object): + """ Names for different pipelines. + + Holds the standard pipline name to use for identifying different pipeline. + This should be used to register pipelines. + + For pipeline which support different models and implements the common function, we + should use task name for this pipeline. + For pipeline which suuport only one model, we should use ${Model}-${Task} as its name. 
+ """ + # vision tasks + image_matting = 'unet-image-matting' + person_image_cartoon = 'unet-person-image-cartoon' + ocr_detection = 'resnet18-ocr-detection' + action_recognition = 'TAdaConv_action-recognition' + + # nlp tasks + sentence_similarity = 'sentence-similarity' + word_segmentation = 'word-segmentation' + text_generation = 'text-generation' + sentiment_analysis = 'sentiment-analysis' + sentiment_classification = 'sentiment-classification' + fill_mask = 'fill-mask' + nli = 'nli' + dialog_intent_prediction = 'dialog-intent-prediction' + dialog_modeling = 'dialog-modeling' + dialog_state_tracking = 'dialog_state_tracking' + + # audio tasks + sambert_hifigan_16k_tts = 'sambert-hifigan-16k-tts' + speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' + + # multi-modal tasks + image_caption = 'image-caption' + + +class Trainers(object): + """ Names for different trainer. + + Holds the standard trainer name to use for identifying different trainer. + This should be used to register trainers. + + For a general Trainer, you can use easynlp-trainer/ofa-trainer/sofa-trainer. + For a model specific Trainer, you can use ${ModelName}-${Task}-trainer. + """ + + default = 'Trainer' + + +class Preprocessors(object): + """ Names for different preprocessor. + + Holds the standard preprocessor name to use for identifying different preprocessor. + This should be used to register preprocessors. + + For a general preprocessor, just use the function name as preprocessor name such as + resize-image, random-crop + For a model-specific preprocessor, use ${modelname}-${fuction} + """ + + # cv preprocessor + load_image = 'load-image' + + # nlp preprocessor + bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' + palm_text_gen_tokenizer = 'palm-text-gen-tokenizer' + token_cls_tokenizer = 'token-cls-tokenizer' + nli_tokenizer = 'nli-tokenizer' + sen_cls_tokenizer = 'sen-cls-tokenizer' + + # audio preprocessor + linear_aec_fbank = 'linear-aec-fbank' + text_to_tacotron_symbols = 'text-to-tacotron-symbols' + + # multi-modal + ofa_image_caption = 'ofa-image-caption' diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index 7d70e6ca..d3423a3f 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -1,7 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from .audio.tts.am import SambertNetHifi16k -from .audio.tts.vocoder import Hifigan16k +# from .audio.tts.am import SambertNetHifi16k +# from .audio.tts.vocoder import Hifigan16k from .base import Model from .builder import MODELS, build_model -from .nlp import BertForSequenceClassification, SbertForSentenceSimilarity +# from .multi_model import OfaForImageCaptioning +from .nlp import (BertForSequenceClassification, SbertForNLI, + SbertForSentenceSimilarity, SbertForSentimentClassification, + SbertForTokenClassification, StructBertForMaskedLM, + VecoForMaskedLM) diff --git a/modelscope/models/audio/tts/am/sambert_hifi_16k.py b/modelscope/models/audio/tts/am/sambert_hifi_16k.py index 2db9abc6..fc6d519a 100644 --- a/modelscope/models/audio/tts/am/sambert_hifi_16k.py +++ b/modelscope/models/audio/tts/am/sambert_hifi_16k.py @@ -6,6 +6,7 @@ import numpy as np import tensorflow as tf from sklearn.preprocessing import MultiLabelBinarizer +from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.constant import ModelFile, Tasks @@ -17,7 +18,7 @@ __all__ = ['SambertNetHifi16k'] def multi_label_symbol_to_sequence(my_classes, my_symbol): - one_hot = MultiLabelBinarizer(my_classes) + one_hot = MultiLabelBinarizer(classes=my_classes) tokens = my_symbol.strip().split(' ') sequences = [] for token in tokens: @@ -26,7 +27,8 @@ def multi_label_symbol_to_sequence(my_classes, my_symbol): return one_hot.fit_transform(sequences) -@MODELS.register_module(Tasks.text_to_speech, module_name=r'sambert_hifi_16k') +@MODELS.register_module( + Tasks.text_to_speech, module_name=Models.sambert_hifi_16k) class SambertNetHifi16k(Model): def __init__(self, diff --git a/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py b/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py index ed34143f..757e4db9 100644 --- a/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py +++ b/modelscope/models/audio/tts/frontend/generic_text_to_speech_frontend.py @@ -2,8 +2,7 @@ import os import zipfile from typing import Any, Dict, List -import ttsfrd - +from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.audio.tts_exceptions import ( @@ -15,11 +14,12 @@ __all__ = ['GenericTtsFrontend'] @MODELS.register_module( - Tasks.text_to_speech, module_name=r'generic_tts_frontend') + Tasks.text_to_speech, module_name=Models.generic_tts_frontend) class GenericTtsFrontend(Model): def __init__(self, model_dir='.', lang_type='pinyin', *args, **kwargs): super().__init__(model_dir, *args, **kwargs) + import ttsfrd frontend = ttsfrd.TtsFrontendEngine() zip_file = os.path.join(model_dir, 'resource.zip') self._res_path = os.path.join(model_dir, 'resource') diff --git a/modelscope/models/audio/tts/vocoder/hifigan16k.py b/modelscope/models/audio/tts/vocoder/hifigan16k.py index 0d917dbe..b3fd9cf6 100644 --- a/modelscope/models/audio/tts/vocoder/hifigan16k.py +++ b/modelscope/models/audio/tts/vocoder/hifigan16k.py @@ -10,6 +10,7 @@ import numpy as np import torch from scipy.io.wavfile import write +from modelscope.metainfo import Models from modelscope.models.base import Model from modelscope.models.builder import MODELS from modelscope.utils.audio.tts_exceptions import \ @@ -36,7 +37,7 @@ class AttrDict(dict): self.__dict__ = self -@MODELS.register_module(Tasks.text_to_speech, module_name=r'hifigan16k') 
+@MODELS.register_module(Tasks.text_to_speech, module_name=Models.hifigan16k) class Hifigan16k(Model): def __init__(self, model_dir, *args, **kwargs): diff --git a/modelscope/models/audio/tts/vocoder/models/models.py b/modelscope/models/audio/tts/vocoder/models/models.py index 83fc7dc2..c46a9204 100755 --- a/modelscope/models/audio/tts/vocoder/models/models.py +++ b/modelscope/models/audio/tts/vocoder/models/models.py @@ -3,7 +3,6 @@ from distutils.version import LooseVersion import torch import torch.nn as nn import torch.nn.functional as F -from pytorch_wavelets import DWT1DForward from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm @@ -357,6 +356,7 @@ class MultiScaleDiscriminator(torch.nn.Module): DiscriminatorS(), DiscriminatorS(), ]) + from pytorch_wavelets import DWT1DForward self.meanpools = nn.ModuleList( [DWT1DForward(wave='db3', J=1), DWT1DForward(wave='db3', J=1)]) diff --git a/modelscope/models/base.py b/modelscope/models/base.py index ab0d22cc..cb6d2b0e 100644 --- a/modelscope/models/base.py +++ b/modelscope/models/base.py @@ -4,12 +4,13 @@ import os.path as osp from abc import ABC, abstractmethod from typing import Dict, Union -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile -from modelscope.utils.hub import get_model_cache_dir +from modelscope.utils.logger import get_logger + +logger = get_logger() Tensor = Union['torch.Tensor', 'tf.Tensor'] @@ -47,21 +48,25 @@ class Model(ABC): if osp.exists(model_name_or_path): local_model_dir = model_name_or_path else: - cache_path = get_model_cache_dir(model_name_or_path) - local_model_dir = cache_path if osp.exists( - cache_path) else snapshot_download(model_name_or_path) - # else: - # raise ValueError( - # 'Remote model repo {model_name_or_path} does not exists') - + local_model_dir = snapshot_download(model_name_or_path) + logger.info(f'initialize model from {local_model_dir}') cfg = Config.from_file( osp.join(local_model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task model_cfg = cfg.model + assert hasattr( + cfg, 'pipeline'), 'pipeline config is missing from config file.' + pipeline_cfg = cfg.pipeline # TODO @wenmeng.zwm may should manually initialize model after model building if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type + model_cfg.model_dir = local_model_dir + for k, v in kwargs.items(): model_cfg.k = v - return build_model(model_cfg, task_name) + model = build_model(model_cfg, task_name) + + # dynamically add pipeline info to model for pipeline inference + model.pipeline = pipeline_cfg + return model diff --git a/modelscope/models/cv/action_recognition/__init__.py b/modelscope/models/cv/action_recognition/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/action_recognition/models.py b/modelscope/models/cv/action_recognition/models.py new file mode 100644 index 00000000..e85b6d81 --- /dev/null +++ b/modelscope/models/cv/action_recognition/models.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn + +from .tada_convnext import TadaConvNeXt + + +class BaseVideoModel(nn.Module): + """ + Standard video model. 
+ The model is divided into the backbone and the head, where the backbone + extracts features and the head performs classification. + + The backbones can be defined in model/base/backbone.py or anywhere else + as long as the backbone is registered by the BACKBONE_REGISTRY. + The heads can be defined in model/module_zoo/heads/ or anywhere else + as long as the head is registered by the HEAD_REGISTRY. + + The registries automatically finds the registered modules and construct + the base video model. + """ + + def __init__(self, cfg): + """ + Args: + cfg (Config): global config object. + """ + super(BaseVideoModel, self).__init__() + # the backbone is created according to meta-architectures + # defined in models/base/backbone.py + self.backbone = TadaConvNeXt(cfg) + + # the head is created according to the heads + # defined in models/module_zoo/heads + self.head = BaseHead(cfg) + + def forward(self, x): + x = self.backbone(x) + x = self.head(x) + return x + + +class BaseHead(nn.Module): + """ + Constructs base head. + """ + + def __init__( + self, + cfg, + ): + """ + Args: + cfg (Config): global config object. + """ + super(BaseHead, self).__init__() + self.cfg = cfg + dim = cfg.VIDEO.BACKBONE.NUM_OUT_FEATURES + num_classes = cfg.VIDEO.HEAD.NUM_CLASSES + dropout_rate = cfg.VIDEO.HEAD.DROPOUT_RATE + activation_func = cfg.VIDEO.HEAD.ACTIVATION + self._construct_head(dim, num_classes, dropout_rate, activation_func) + + def _construct_head(self, dim, num_classes, dropout_rate, activation_func): + self.global_avg_pool = nn.AdaptiveAvgPool3d(1) + + if dropout_rate > 0.0: + self.dropout = nn.Dropout(dropout_rate) + + self.out = nn.Linear(dim, num_classes, bias=True) + + if activation_func == 'softmax': + self.activation = nn.Softmax(dim=-1) + elif activation_func == 'sigmoid': + self.activation = nn.Sigmoid() + else: + raise NotImplementedError('{} is not supported as an activation' + 'function.'.format(activation_func)) + + def forward(self, x): + if len(x.shape) == 5: + x = self.global_avg_pool(x) + # (N, C, T, H, W) -> (N, T, H, W, C). + x = x.permute((0, 2, 3, 4, 1)) + if hasattr(self, 'dropout'): + out = self.dropout(x) + else: + out = x + out = self.out(out) + out = self.activation(out) + out = out.view(out.shape[0], -1) + return out, x.view(x.shape[0], -1) diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py new file mode 100644 index 00000000..379b5271 --- /dev/null +++ b/modelscope/models/cv/action_recognition/tada_convnext.py @@ -0,0 +1,472 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import _pair, _triple + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """ + From https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py. + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """ + From https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py. + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class TadaConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + + def __init__( + self, cfg + # in_chans=3, num_classes=1000, + # depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], drop_path_rate=0., + # layer_scale_init_value=1e-6, head_init_scale=1., + ): + super().__init__() + in_chans = cfg.VIDEO.BACKBONE.NUM_INPUT_CHANNELS + dims = cfg.VIDEO.BACKBONE.NUM_FILTERS + drop_path_rate = cfg.VIDEO.BACKBONE.DROP_PATH + depths = cfg.VIDEO.BACKBONE.DEPTH + layer_scale_init_value = cfg.VIDEO.BACKBONE.LARGE_SCALE_INIT_VALUE + stem_t_kernel_size = cfg.VIDEO.BACKBONE.STEM.T_KERNEL_SIZE if hasattr( + cfg.VIDEO.BACKBONE.STEM, 'T_KERNEL_SIZE') else 2 + t_stride = cfg.VIDEO.BACKBONE.STEM.T_STRIDE if hasattr( + cfg.VIDEO.BACKBONE.STEM, 'T_STRIDE') else 2 + + self.downsample_layers = nn.ModuleList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv3d( + in_chans, + dims[0], + kernel_size=(stem_t_kernel_size, 4, 4), + stride=(t_stride, 4, 4), + padding=((stem_t_kernel_size - 1) // 2, 0, 0)), + LayerNorm(dims[0], eps=1e-6, data_format='channels_first')) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format='channels_first'), + nn.Conv3d( + dims[i], + dims[i + 1], + kernel_size=(1, 2, 2), + stride=(1, 2, 2)), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + for i in range(4): + stage = nn.Sequential(*[ + TAdaConvNeXtBlock( + cfg, + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + return self.norm(x.mean( + [-3, -2, -1])) # global average pooling, (N, C, H, W) -> (N, C) + + def forward(self, x): + if isinstance(x, dict): + x = 
x['video'] + x = self.forward_features(x) + return x + + def get_num_layers(self): + return 12, 0 + + +class ConvNeXtBlock(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv3d( + dim, dim, kernel_size=(1, 7, 7), padding=(0, 3, 3), + groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, + 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W) + + x = input + self.drop_path(x) + return x + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, + normalized_shape, + eps=1e-6, + data_format='channels_last'): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ['channels_last', 'channels_first']: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == 'channels_last': + return F.layer_norm(x, self.normalized_shape, self.weight, + self.bias, self.eps) + elif self.data_format == 'channels_first': + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None, None] * x + self.bias[:, None, None, + None] + return x + + +class TAdaConvNeXtBlock(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_fi rst) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
+ """ + + def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + layer_scale_init_value = float(layer_scale_init_value) + self.dwconv = TAdaConv2d( + dim, + dim, + kernel_size=(1, 7, 7), + padding=(0, 3, 3), + groups=dim, + cal_dim='cout') + route_func_type = cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_TYPE + if route_func_type == 'normal': + self.dwconv_rf = RouteFuncMLP( + c_in=dim, + ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, + kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, + with_bias_cal=self.dwconv.bias is not None) + elif route_func_type == 'normal_lngelu': + self.dwconv_rf = RouteFuncMLPLnGelu( + c_in=dim, + ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, + kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, + with_bias_cal=self.dwconv.bias is not None) + else: + raise ValueError( + 'Unknown route_func_type: {}'.format(route_func_type)) + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, + 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x, self.dwconv_rf(x)) + x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W) + + x = input + self.drop_path(x) + return x + + +class RouteFuncMLPLnGelu(nn.Module): + """ + The routing function for generating the calibration weights. + """ + + def __init__(self, + c_in, + ratio, + kernels, + with_bias_cal=False, + bn_eps=1e-5, + bn_mmt=0.1): + """ + Args: + c_in (int): number of input channels. + ratio (int): reduction ratio for the routing function. + kernels (list): temporal kernel size of the stacked 1D convolutions + """ + super(RouteFuncMLPLnGelu, self).__init__() + self.c_in = c_in + self.with_bias_cal = with_bias_cal + self.avgpool = nn.AdaptiveAvgPool3d((None, 1, 1)) + self.globalpool = nn.AdaptiveAvgPool3d(1) + self.g = nn.Conv3d( + in_channels=c_in, + out_channels=c_in, + kernel_size=1, + padding=0, + ) + self.a = nn.Conv3d( + in_channels=c_in, + out_channels=int(c_in // ratio), + kernel_size=[kernels[0], 1, 1], + padding=[kernels[0] // 2, 0, 0], + ) + # self.bn = nn.BatchNorm3d(int(c_in//ratio), eps=bn_eps, momentum=bn_mmt) + self.ln = LayerNorm( + int(c_in // ratio), eps=1e-6, data_format='channels_first') + self.gelu = nn.GELU() + # self.relu = nn.ReLU(inplace=True) + self.b = nn.Conv3d( + in_channels=int(c_in // ratio), + out_channels=c_in, + kernel_size=[kernels[1], 1, 1], + padding=[kernels[1] // 2, 0, 0], + bias=False) + self.b.skip_init = True + self.b.weight.data.zero_() # to make sure the initial values + # for the output is 1. + if with_bias_cal: + self.b_bias = nn.Conv3d( + in_channels=int(c_in // ratio), + out_channels=c_in, + kernel_size=[kernels[1], 1, 1], + padding=[kernels[1] // 2, 0, 0], + bias=False) + self.b_bias.skip_init = True + self.b_bias.weight.data.zero_() # to make sure the initial values + # for the output is 1. 
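    # Editor's note (illustrative sketch, not part of the original diff): this
    # routing function emits one calibration tensor per frame, shaped
    # (N, C, T, 1, 1).  Because self.b / self.b_bias are zero-initialised and
    # forward() adds 1, the calibration starts out as the identity, so a
    # freshly initialised TAdaConv2d behaves like a plain depthwise conv.
    # A shape-check sketch with made-up sizes, assuming the classes defined in
    # this file are importable and torch is installed:
    #
    #   import torch
    #   rf = RouteFuncMLPLnGelu(c_in=96, ratio=4, kernels=[3, 3],
    #                           with_bias_cal=True)
    #   conv = TAdaConv2d(96, 96, kernel_size=(1, 7, 7), padding=(0, 3, 3),
    #                     groups=96, cal_dim='cout')
    #   x = torch.randn(2, 96, 8, 56, 56)        # (N, C, T, H, W)
    #   w_alpha, b_alpha = rf(x)                 # each (2, 96, 8, 1, 1), ~1.0
    #   y = conv(x, [w_alpha, b_alpha])          # (2, 96, 8, 56, 56)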
+ + def forward(self, x): + g = self.globalpool(x) + x = self.avgpool(x) + x = self.a(x + self.g(g)) + # x = self.bn(x) + # x = self.relu(x) + x = self.ln(x) + x = self.gelu(x) + if self.with_bias_cal: + return [self.b(x) + 1, self.b_bias(x) + 1] + else: + return self.b(x) + 1 + + +class TAdaConv2d(nn.Module): + """ + Performs temporally adaptive 2D convolution. + Currently, only application on 5D tensors is supported, which makes TAdaConv2d + essentially a 3D convolution with temporal kernel size of 1. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + cal_dim='cin'): + super(TAdaConv2d, self).__init__() + """ + Args: + in_channels (int): number of input channels. + out_channels (int): number of output channels. + kernel_size (list): kernel size of TAdaConv2d. + stride (list): stride for the convolution in TAdaConv2d. + padding (list): padding for the convolution in TAdaConv2d. + dilation (list): dilation of the convolution in TAdaConv2d. + groups (int): number of groups for TAdaConv2d. + bias (bool): whether to use bias in TAdaConv2d. + calibration_mode (str): calibrated dimension in TAdaConv2d. + Supported input "cin", "cout". + """ + + kernel_size = _triple(kernel_size) + stride = _triple(stride) + padding = _triple(padding) + dilation = _triple(dilation) + + assert kernel_size[0] == 1 + assert stride[0] == 1 + assert padding[0] == 0 + assert dilation[0] == 1 + assert cal_dim in ['cin', 'cout'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.cal_dim = cal_dim + + # base weights (W_b) + self.weight = nn.Parameter( + torch.Tensor(1, 1, out_channels, in_channels // groups, + kernel_size[1], kernel_size[2])) + if bias: + self.bias = nn.Parameter(torch.Tensor(1, 1, out_channels)) + else: + self.register_parameter('bias', None) + + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias, -bound, bound) + + def forward(self, x, alpha): + """ + Args: + x (tensor): feature to perform convolution on. + alpha (tensor): calibration weight for the base weights. 
+ W_t = alpha_t * W_b + """ + if isinstance(alpha, list): + w_alpha, b_alpha = alpha[0], alpha[1] + else: + w_alpha = alpha + b_alpha = None + _, _, c_out, c_in, kh, kw = self.weight.size() + b, c_in, t, h, w = x.size() + x = x.permute(0, 2, 1, 3, 4).reshape(1, -1, h, w) + + if self.cal_dim == 'cin': + # w_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, 1, C, H(1), W(1) + # corresponding to calibrating the input channel + weight = (w_alpha.permute(0, 2, 1, 3, 4).unsqueeze(2) + * self.weight).reshape(-1, c_in // self.groups, kh, kw) + elif self.cal_dim == 'cout': + # w_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, C, 1, H(1), W(1) + # corresponding to calibrating the input channel + weight = (w_alpha.permute(0, 2, 1, 3, 4).unsqueeze(3) + * self.weight).reshape(-1, c_in // self.groups, kh, kw) + + bias = None + if self.bias is not None: + if b_alpha is not None: + # b_alpha: B, C, T, H(1), W(1) -> B, T, C, H(1), W(1) -> B, T, C + bias = (b_alpha.permute(0, 2, 1, 3, 4).squeeze() + * self.bias).reshape(-1) + else: + bias = self.bias.repeat(b, t, 1).reshape(-1) + output = F.conv2d( + x, + weight=weight, + bias=bias, + stride=self.stride[1:], + padding=self.padding[1:], + dilation=self.dilation[1:], + groups=self.groups * b * t) + + output = output.view(b, t, c_out, output.size(-2), + output.size(-1)).permute(0, 2, 1, 3, 4) + + return output + + def __repr__(self): + return f'TAdaConv2d({self.in_channels}, {self.out_channels}, kernel_size={self.kernel_size}, ' +\ + f"stride={self.stride}, padding={self.padding}, bias={self.bias is not None}, cal_dim=\"{self.cal_dim}\")" diff --git a/modelscope/models/multi_model/__init__.py b/modelscope/models/multi_model/__init__.py new file mode 100644 index 00000000..02e8d6ab --- /dev/null +++ b/modelscope/models/multi_model/__init__.py @@ -0,0 +1 @@ +from .image_captioning_model import OfaForImageCaptioning diff --git a/modelscope/models/multi_model/image_captioning_model.py b/modelscope/models/multi_model/image_captioning_model.py new file mode 100644 index 00000000..79ab2b5f --- /dev/null +++ b/modelscope/models/multi_model/image_captioning_model.py @@ -0,0 +1,80 @@ +import os.path as osp +from typing import Any, Dict + +from PIL import Image + +from modelscope.metainfo import Models +from modelscope.utils.constant import ModelFile, Tasks +from ..base import Model +from ..builder import MODELS + +__all__ = ['OfaForImageCaptioning'] + + +@MODELS.register_module(Tasks.image_captioning, module_name=Models.ofa) +class OfaForImageCaptioning(Model): + + def __init__(self, model_dir, *args, **kwargs): + super().__init__(model_dir=model_dir, *args, **kwargs) + ckpt_name = ModelFile.TORCH_MODEL_FILE + local_model = osp.join(model_dir, ckpt_name) + bpe_dir = model_dir + # turn on cuda if GPU is available + from fairseq import checkpoint_utils, tasks, utils + from ofa.tasks.mm_tasks import CaptionTask + from ofa.utils.eval_utils import eval_caption + self.eval_caption = eval_caption + + tasks.register_task('caption', CaptionTask) + use_cuda = kwargs['use_cuda'] if 'use_cuda' in kwargs else False + use_fp16 = kwargs[ + 'use_fp16'] if 'use_fp16' in kwargs and use_cuda else False + overrides = { + 'bpe_dir': bpe_dir, + 'eval_cider': False, + 'beam': 5, + 'max_len_b': 16, + 'no_repeat_ngram_size': 3, + 'seed': 7 + } + models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(local_model), arg_overrides=overrides) + + # Move models to GPU + for model in models: + model.eval() + if use_cuda: + model.cuda() + if use_fp16: 
+ model.half() + model.prepare_for_inference_(cfg) + self.models = models + # Initialize generator + self.generator = task.build_generator(models, cfg.generation) + + # Initialize transform + from torchvision import transforms + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + + self.patch_resize_transform = transforms.Compose([ + lambda image: image.convert('RGB'), + transforms.Resize( + (cfg.task.patch_image_size, cfg.task.patch_image_size), + interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + self.task = task + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results, _ = self.eval_caption(self.task, self.generator, self.models, + input) + return { + 'image_id': results[0]['image_id'], + 'caption': results[0]['caption'] + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + # What should we do here ? + return inputs diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index e62ab404..49cbd053 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -1,6 +1,9 @@ from .bert_for_sequence_classification import * # noqa F403 +from .masked_language_model import * # noqa F403 from .palm_for_text_generation import * # noqa F403 +from .sbert_for_nli import * # noqa F403 from .sbert_for_sentence_similarity import * # noqa F403 +from .sbert_for_sentiment_classification import * # noqa F403 from .sbert_for_token_classification import * # noqa F403 from .space.dialog_intent_prediction_model import * # noqa F403 from .space.dialog_modeling_model import * # noqa F403 diff --git a/modelscope/models/nlp/bert_for_sequence_classification.py b/modelscope/models/nlp/bert_for_sequence_classification.py index a3cc4b68..7d85fa28 100644 --- a/modelscope/models/nlp/bert_for_sequence_classification.py +++ b/modelscope/models/nlp/bert_for_sequence_classification.py @@ -4,6 +4,7 @@ from typing import Any, Dict import json import numpy as np +from modelscope.metainfo import Models from modelscope.utils.constant import Tasks from ..base import Model from ..builder import MODELS @@ -11,8 +12,7 @@ from ..builder import MODELS __all__ = ['BertForSequenceClassification'] -@MODELS.register_module( - Tasks.text_classification, module_name=r'bert-sentiment-analysis') +@MODELS.register_module(Tasks.text_classification, module_name=Models.bert) class BertForSequenceClassification(Model): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/nlp/masked_language_model.py b/modelscope/models/nlp/masked_language_model.py new file mode 100644 index 00000000..bb255c9c --- /dev/null +++ b/modelscope/models/nlp/masked_language_model.py @@ -0,0 +1,63 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from ...metainfo import Models +from ...utils.constant import Tasks +from ..base import Model, Tensor +from ..builder import MODELS + +__all__ = ['StructBertForMaskedLM', 'VecoForMaskedLM'] + + +class MaskedLanguageModelBase(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.model = self.build_model() + + def build_model(self): + raise NotImplementedError() + + def train(self): + return self.model.train() + + def eval(self): + return self.model.eval() + + @property + def config(self): + if hasattr(self.model, 'config'): + return self.model.config + return None + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, np.ndarray]: + """return the result by the model 
+ + Args: + input (Dict[str, Any]): the preprocessed data + + Returns: + Dict[str, np.ndarray]: results + """ + rst = self.model( + input_ids=input['input_ids'], + attention_mask=input['attention_mask'], + token_type_ids=input['token_type_ids']) + return {'logits': rst['logits'], 'input_ids': input['input_ids']} + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert) +class StructBertForMaskedLM(MaskedLanguageModelBase): + + def build_model(self): + from sofa import SbertForMaskedLM + return SbertForMaskedLM.from_pretrained(self.model_dir) + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco) +class VecoForMaskedLM(MaskedLanguageModelBase): + + def build_model(self): + from sofa import VecoForMaskedLM + return VecoForMaskedLM.from_pretrained(self.model_dir) diff --git a/modelscope/models/nlp/palm_for_text_generation.py b/modelscope/models/nlp/palm_for_text_generation.py index e5799feb..f6c15387 100644 --- a/modelscope/models/nlp/palm_for_text_generation.py +++ b/modelscope/models/nlp/palm_for_text_generation.py @@ -1,13 +1,14 @@ from typing import Dict -from modelscope.utils.constant import Tasks +from ...metainfo import Models +from ...utils.constant import Tasks from ..base import Model, Tensor from ..builder import MODELS __all__ = ['PalmForTextGeneration'] -@MODELS.register_module(Tasks.text_generation, module_name=r'palm2.0') +@MODELS.register_module(Tasks.text_generation, module_name=Models.palm) class PalmForTextGeneration(Model): def __init__(self, model_dir: str, *args, **kwargs): @@ -19,13 +20,18 @@ class PalmForTextGeneration(Model): default loader to load model weights, by default None. """ super().__init__(model_dir, *args, **kwargs) - self.model_dir = model_dir from sofa.models.palm_v2 import PalmForConditionalGeneration, Translator model = PalmForConditionalGeneration.from_pretrained(model_dir) self.tokenizer = model.tokenizer self.generator = Translator(model) + def train(self): + return self.generator.train() + + def eval(self): + return self.generator.eval() + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: """return the result by the model diff --git a/modelscope/models/nlp/sbert_for_nli.py b/modelscope/models/nlp/sbert_for_nli.py new file mode 100644 index 00000000..a5a76b34 --- /dev/null +++ b/modelscope/models/nlp/sbert_for_nli.py @@ -0,0 +1,23 @@ +from ...metainfo import Models +from ...utils.constant import Tasks +from ..builder import MODELS +from .sbert_for_sequence_classification import \ + SbertForSequenceClassificationBase + +__all__ = ['SbertForNLI'] + + +@MODELS.register_module(Tasks.nli, module_name=Models.structbert) +class SbertForNLI(SbertForSequenceClassificationBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + model_cls (Optional[Any], optional): model loader, if None, use the + default loader to load model weights, by default None. 
+ """ + super().__init__( + model_dir, *args, model_args={'num_labels': 3}, **kwargs) + assert self.model.config.num_labels == 3 diff --git a/modelscope/models/nlp/sbert_for_sentence_similarity.py b/modelscope/models/nlp/sbert_for_sentence_similarity.py index 98daac92..25c38a2e 100644 --- a/modelscope/models/nlp/sbert_for_sentence_similarity.py +++ b/modelscope/models/nlp/sbert_for_sentence_similarity.py @@ -1,46 +1,15 @@ -import os -from typing import Any, Dict - -import json -import numpy as np -import torch -from sofa import SbertModel -from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel -from torch import nn - +from modelscope.metainfo import Models from modelscope.utils.constant import Tasks -from ..base import Model, Tensor from ..builder import MODELS +from .sbert_for_sequence_classification import \ + SbertForSequenceClassificationBase __all__ = ['SbertForSentenceSimilarity'] -class SbertTextClassifier(SbertPreTrainedModel): - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.config = config - self.encoder = SbertModel(config, add_pooling_layer=True) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward(self, input_ids=None, token_type_ids=None): - outputs = self.encoder( - input_ids, - token_type_ids=token_type_ids, - return_dict=None, - ) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - return logits - - @MODELS.register_module( - Tasks.sentence_similarity, - module_name=r'sbert-base-chinese-sentence-similarity') -class SbertForSentenceSimilarity(Model): + Tasks.sentence_similarity, module_name=Models.structbert) +class SbertForSentenceSimilarity(SbertForSequenceClassificationBase): def __init__(self, model_dir: str, *args, **kwargs): """initialize the sentence similarity model from the `model_dir` path. @@ -50,39 +19,7 @@ class SbertForSentenceSimilarity(Model): model_cls (Optional[Any], optional): model loader, if None, use the default loader to load model weights, by default None. 
""" - super().__init__(model_dir, *args, **kwargs) + super().__init__( + model_dir, *args, model_args={'num_labels': 2}, **kwargs) self.model_dir = model_dir - - self.model = SbertTextClassifier.from_pretrained( - model_dir, num_labels=2) - self.model.eval() - self.label_path = os.path.join(self.model_dir, 'label_mapping.json') - with open(self.label_path) as f: - self.label_mapping = json.load(f) - self.id2label = {idx: name for name, idx in self.label_mapping.items()} - - def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: - """return the result by the model - - Args: - input (Dict[str, Any]): the preprocessed data - - Returns: - Dict[str, np.ndarray]: results - Example: - { - 'predictions': array([1]), # lable 0-negative 1-positive - 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), - 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value - } - """ - input_ids = torch.tensor(input['input_ids'], dtype=torch.long) - token_type_ids = torch.tensor( - input['token_type_ids'], dtype=torch.long) - with torch.no_grad(): - logits = self.model(input_ids, token_type_ids) - probs = logits.softmax(-1).numpy() - pred = logits.argmax(-1).numpy() - logits = logits.numpy() - res = {'predictions': pred, 'probabilities': probs, 'logits': logits} - return res + assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sentiment_classification.py b/modelscope/models/nlp/sbert_for_sentiment_classification.py new file mode 100644 index 00000000..72fb92f0 --- /dev/null +++ b/modelscope/models/nlp/sbert_for_sentiment_classification.py @@ -0,0 +1,24 @@ +from modelscope.metainfo import Models +from modelscope.utils.constant import Tasks +from ..builder import MODELS +from .sbert_for_sequence_classification import \ + SbertForSequenceClassificationBase + +__all__ = ['SbertForSentimentClassification'] + + +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.structbert) +class SbertForSentimentClassification(SbertForSequenceClassificationBase): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + model_cls (Optional[Any], optional): model loader, if None, use the + default loader to load model weights, by default None. 
+ """ + super().__init__( + model_dir, *args, model_args={'num_labels': 2}, **kwargs) + assert self.model.config.num_labels == 2 diff --git a/modelscope/models/nlp/sbert_for_sequence_classification.py b/modelscope/models/nlp/sbert_for_sequence_classification.py new file mode 100644 index 00000000..861b6fe2 --- /dev/null +++ b/modelscope/models/nlp/sbert_for_sequence_classification.py @@ -0,0 +1,71 @@ +import os +from typing import Any, Dict + +import json +import numpy as np +import torch +from sofa.models.sbert.modeling_sbert import SbertModel, SbertPreTrainedModel +from torch import nn + +from ..base import Model + + +class SbertTextClassfier(SbertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.encoder = SbertModel(config, add_pooling_layer=True) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids=None, token_type_ids=None): + outputs = self.encoder( + input_ids, + token_type_ids=token_type_ids, + return_dict=None, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return {'logits': logits} + + +class SbertForSequenceClassificationBase(Model): + + def __init__(self, model_dir: str, model_args=None, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + if model_args is None: + model_args = {} + self.model = SbertTextClassfier.from_pretrained( + model_dir, **model_args) + self.id2label = {} + self.label_path = os.path.join(self.model_dir, 'label_mapping.json') + if os.path.exists(self.label_path): + with open(self.label_path) as f: + self.label_mapping = json.load(f) + self.id2label = { + idx: name + for name, idx in self.label_mapping.items() + } + + def train(self): + return self.model.train() + + def eval(self): + return self.model.eval() + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + input_ids = torch.tensor(input['input_ids'], dtype=torch.long) + token_type_ids = torch.tensor( + input['token_type_ids'], dtype=torch.long) + return self.model.forward(input_ids, token_type_ids) + + def postprocess(self, input, **kwargs): + logits = input['logits'] + probs = logits.softmax(-1).numpy() + pred = logits.argmax(-1).numpy() + logits = logits.numpy() + res = {'predictions': pred, 'probabilities': probs, 'logits': logits} + return res diff --git a/modelscope/models/nlp/sbert_for_token_classification.py b/modelscope/models/nlp/sbert_for_token_classification.py index b918dc37..fd175033 100644 --- a/modelscope/models/nlp/sbert_for_token_classification.py +++ b/modelscope/models/nlp/sbert_for_token_classification.py @@ -2,19 +2,17 @@ from typing import Any, Dict, Union import numpy as np import torch -from sofa import SbertConfig, SbertForTokenClassification +from modelscope.metainfo import Models from modelscope.utils.constant import Tasks from ..base import Model, Tensor from ..builder import MODELS -__all__ = ['StructBertForTokenClassification'] +__all__ = ['SbertForTokenClassification'] -@MODELS.register_module( - Tasks.word_segmentation, - module_name=r'structbert-chinese-word-segmentation') -class StructBertForTokenClassification(Model): +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.structbert) +class SbertForTokenClassification(Model): def __init__(self, model_dir: str, *args, **kwargs): """initialize the word segmentation model from the `model_dir` path. 
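# Editor's note (illustration, not part of the original diff): with this
# refactor the Sbert classification models return raw logits from forward()
# and defer softmax/argmax to postprocess(), mirroring
# SbertForSequenceClassificationBase.postprocess above.  A minimal,
# self-contained sketch of that contract using a dummy logits tensor (the
# values are placeholders):
import torch

outputs = {'logits': torch.tensor([[-0.54, 1.50]])}    # what forward() returns
probs = outputs['logits'].softmax(-1).numpy()           # computed in postprocess()
preds = outputs['logits'].argmax(-1).numpy()
print({'predictions': preds, 'probabilities': probs})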
@@ -26,9 +24,16 @@ class StructBertForTokenClassification(Model): """ super().__init__(model_dir, *args, **kwargs) self.model_dir = model_dir - self.model = SbertForTokenClassification.from_pretrained( + import sofa + self.model = sofa.SbertForTokenClassification.from_pretrained( self.model_dir) - self.config = SbertConfig.from_pretrained(self.model_dir) + self.config = sofa.SbertConfig.from_pretrained(self.model_dir) + + def train(self): + return self.model.train() + + def eval(self): + return self.model.eval() def forward(self, input: Dict[str, Any]) -> Dict[str, Union[str, np.ndarray]]: @@ -47,10 +52,12 @@ class StructBertForTokenClassification(Model): } """ input_ids = torch.tensor(input['input_ids']).unsqueeze(0) - output = self.model(input_ids) - logits = output.logits + return {**self.model(input_ids), 'text': input['text']} + + def postprocess(self, input: Dict[str, Tensor], + **kwargs) -> Dict[str, Tensor]: + logits = input['logits'] pred = torch.argmax(logits[0], dim=-1) pred = pred.numpy() - rst = {'predictions': pred, 'logits': logits, 'text': input['text']} return rst diff --git a/modelscope/models/nlp/space/dialog_intent_prediction_model.py b/modelscope/models/nlp/space/dialog_intent_prediction_model.py index 3ea500e5..a5d94376 100644 --- a/modelscope/models/nlp/space/dialog_intent_prediction_model.py +++ b/modelscope/models/nlp/space/dialog_intent_prediction_model.py @@ -1,11 +1,10 @@ import os from typing import Any, Dict -from modelscope.preprocessors.space.fields.intent_field import \ - IntentBPETextField -from modelscope.trainers.nlp.space.trainers.intent_trainer import IntentTrainer -from modelscope.utils.config import Config -from modelscope.utils.constant import Tasks +from ....preprocessors.space.fields.intent_field import IntentBPETextField +from ....trainers.nlp.space.trainers.intent_trainer import IntentTrainer +from ....utils.config import Config +from ....utils.constant import Tasks from ...base import Model, Tensor from ...builder import MODELS from .model.generator import Generator @@ -14,8 +13,7 @@ from .model.model_base import ModelBase __all__ = ['DialogIntentModel'] -@MODELS.register_module( - Tasks.dialog_intent_prediction, module_name=r'space-intent') +@MODELS.register_module(Tasks.dialog_intent_prediction, module_name=r'space') class DialogIntentModel(Model): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/nlp/space/dialog_modeling_model.py b/modelscope/models/nlp/space/dialog_modeling_model.py index bae8a822..4a34f132 100644 --- a/modelscope/models/nlp/space/dialog_modeling_model.py +++ b/modelscope/models/nlp/space/dialog_modeling_model.py @@ -1,11 +1,10 @@ import os from typing import Any, Dict, Optional -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField -from modelscope.trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer -from modelscope.utils.config import Config -from modelscope.utils.constant import Tasks +from ....preprocessors.space.fields.gen_field import MultiWOZBPETextField +from ....trainers.nlp.space.trainers.gen_trainer import MultiWOZTrainer +from ....utils.config import Config +from ....utils.constant import Tasks from ...base import Model, Tensor from ...builder import MODELS from .model.generator import Generator @@ -14,7 +13,7 @@ from .model.model_base import ModelBase __all__ = ['DialogModelingModel'] -@MODELS.register_module(Tasks.dialog_modeling, module_name=r'space-modeling') +@MODELS.register_module(Tasks.dialog_modeling, module_name=r'space') 
class DialogModelingModel(Model): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/nlp/space/dialog_state_tracking.py b/modelscope/models/nlp/space/dialog_state_tracking.py index 4b1c44d3..e94c59b0 100644 --- a/modelscope/models/nlp/space/dialog_state_tracking.py +++ b/modelscope/models/nlp/space/dialog_state_tracking.py @@ -11,7 +11,7 @@ from .model.model_base import ModelBase __all__ = ['DialogStateTrackingModel'] -@MODELS.register_module(Tasks.dialog_state_tracking, module_name=r'space-dst') +@MODELS.register_module(Tasks.dialog_state_tracking, module_name=r'space') class DialogStateTrackingModel(Model): def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/nlp/space/model/gen_unified_transformer.py b/modelscope/models/nlp/space/model/gen_unified_transformer.py index c076cce4..0f1b1a83 100644 --- a/modelscope/models/nlp/space/model/gen_unified_transformer.py +++ b/modelscope/models/nlp/space/model/gen_unified_transformer.py @@ -3,8 +3,7 @@ IntentUnifiedTransformer """ import torch -from modelscope.models.nlp.space.model.unified_transformer import \ - UnifiedTransformer +from .unified_transformer import UnifiedTransformer class GenUnifiedTransformer(UnifiedTransformer): diff --git a/modelscope/models/nlp/space/model/intent_unified_transformer.py b/modelscope/models/nlp/space/model/intent_unified_transformer.py index 646a8044..b9c699d7 100644 --- a/modelscope/models/nlp/space/model/intent_unified_transformer.py +++ b/modelscope/models/nlp/space/model/intent_unified_transformer.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.utils.nlp.space.criterions import compute_kl_loss +from .....utils.nlp.space.criterions import compute_kl_loss from .unified_transformer import UnifiedTransformer diff --git a/modelscope/models/nlp/space/model/unified_transformer.py b/modelscope/models/nlp/space/model/unified_transformer.py index a25bc7f4..2636553d 100644 --- a/modelscope/models/nlp/space/model/unified_transformer.py +++ b/modelscope/models/nlp/space/model/unified_transformer.py @@ -7,10 +7,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.models.nlp.space.model.model_base import ModelBase -from modelscope.models.nlp.space.modules.embedder import Embedder -from modelscope.models.nlp.space.modules.transformer_block import \ - TransformerBlock +from ..modules.embedder import Embedder +from ..modules.transformer_block import TransformerBlock +from .model_base import ModelBase class UnifiedTransformer(ModelBase): diff --git a/modelscope/models/nlp/space/modules/transformer_block.py b/modelscope/models/nlp/space/modules/transformer_block.py index 1a0565d6..5b6c79a5 100644 --- a/modelscope/models/nlp/space/modules/transformer_block.py +++ b/modelscope/models/nlp/space/modules/transformer_block.py @@ -5,9 +5,8 @@ TransformerBlock class. 
import torch import torch.nn as nn -from modelscope.models.nlp.space.modules.feedforward import FeedForward -from modelscope.models.nlp.space.modules.multihead_attention import \ - MultiheadAttention +from .feedforward import FeedForward +from .multihead_attention import MultiheadAttention class TransformerBlock(nn.Module): diff --git a/modelscope/pipelines/__init__.py b/modelscope/pipelines/__init__.py index 6e2645de..962b2245 100644 --- a/modelscope/pipelines/__init__.py +++ b/modelscope/pipelines/__init__.py @@ -1,7 +1,4 @@ -from .audio import LinearAECPipeline +# from .audio import LinearAECPipeline from .base import Pipeline from .builder import pipeline -from .cv import * # noqa F403 -from .multi_modal import * # noqa F403 from .nlp import * # noqa F403 -from .nlp.space import * # noqa F403 diff --git a/modelscope/pipelines/audio/linear_aec_pipeline.py b/modelscope/pipelines/audio/linear_aec_pipeline.py index 528d8d47..70562b19 100644 --- a/modelscope/pipelines/audio/linear_aec_pipeline.py +++ b/modelscope/pipelines/audio/linear_aec_pipeline.py @@ -7,6 +7,7 @@ import scipy.io.wavfile as wav import torch import yaml +from modelscope.metainfo import Pipelines from modelscope.preprocessors.audio import LinearAECAndFbank from modelscope.utils.constant import ModelFile, Tasks from ..base import Pipeline @@ -39,7 +40,8 @@ def initialize_config(module_cfg): @PIPELINES.register_module( - Tasks.speech_signal_process, module_name=r'speech_dfsmn_aec_psm_16k') + Tasks.speech_signal_process, + module_name=Pipelines.speech_dfsmn_aec_psm_16k) class LinearAECPipeline(Pipeline): r"""AEC Inference Pipeline only support 16000 sample rate. diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index ecd9daac..22586d3e 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List import numpy as np +from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.models.audio.tts.am import SambertNetHifi16k from modelscope.models.audio.tts.vocoder import Hifigan16k @@ -15,7 +16,7 @@ __all__ = ['TextToSpeechSambertHifigan16kPipeline'] @PIPELINES.register_module( - Tasks.text_to_speech, module_name=r'tts-sambert-hifigan-16k') + Tasks.text_to_speech, module_name=Pipelines.sambert_hifigan_16k_tts) class TextToSpeechSambertHifigan16kPipeline(Pipeline): def __init__(self, diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index e266d21c..7e32f543 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -4,19 +4,17 @@ import os.path as osp from abc import ABC, abstractmethod from typing import Any, Dict, Generator, List, Union -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.base import Model from modelscope.preprocessors import Preprocessor from modelscope.pydatasets import PyDataset from modelscope.utils.config import Config -from modelscope.utils.hub import get_model_cache_dir from modelscope.utils.logger import get_logger from .outputs import TASK_OUTPUTS -from .util import is_model_name +from .util import is_model, is_official_hub_path Tensor = Union['torch.Tensor', 'tf.Tensor'] -Input = Union[str, tuple, dict, PyDataset, 'PIL.Image.Image', 'numpy.ndarray'] +Input = Union[str, tuple, PyDataset, 'PIL.Image.Image', 'numpy.ndarray'] InputModel = Union[str, 
Model] output_keys = [ @@ -29,14 +27,10 @@ class Pipeline(ABC): def initiate_single_model(self, model): logger.info(f'initiate model from {model}') - # TODO @wenmeng.zwm replace model.startswith('damo/') with get_model - if isinstance(model, str) and model.startswith('damo/'): - if not osp.exists(model): - cache_path = get_model_cache_dir(model) - model = cache_path if osp.exists( - cache_path) else snapshot_download(model) - return Model.from_pretrained(model) if is_model_name( - model) else model + if isinstance(model, str) and is_official_hub_path(model): + model = snapshot_download( + model) if not osp.exists(model) else model + return Model.from_pretrained(model) if is_model(model) else model elif isinstance(model, Model): return model else: @@ -104,7 +98,7 @@ class Pipeline(ABC): def _process_single(self, input: Input, *args, **post_kwargs) -> Dict[str, Any]: - out = self.preprocess(input, **post_kwargs) + out = self.preprocess(input) out = self.forward(out) out = self.postprocess(out, **post_kwargs) self._check_output(out) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 6e2c791d..cff1801d 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -1,33 +1,49 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os.path as osp from typing import List, Union +from modelscope.metainfo import Pipelines from modelscope.models.base import Model from modelscope.utils.config import Config, ConfigDict -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.hub import read_config from modelscope.utils.registry import Registry, build_from_cfg from .base import Pipeline +from .util import is_official_hub_path PIPELINES = Registry('pipelines') DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) Tasks.word_segmentation: - ('structbert-chinese-word-segmentation', + (Pipelines.word_segmentation, 'damo/nlp_structbert_word-segmentation_chinese-base'), Tasks.sentence_similarity: - ('sbert-base-chinese-sentence-similarity', + (Pipelines.sentence_similarity, 'damo/nlp_structbert_sentence-similarity_chinese-base'), Tasks.image_matting: ('image-matting', 'damo/cv_unet_image-matting'), - Tasks.text_classification: - ('bert-sentiment-analysis', 'damo/bert-base-sst2'), - Tasks.text_generation: ('palm2.0', + Tasks.nli: (Pipelines.nli, 'damo/nlp_structbert_nli_chinese-base'), + Tasks.sentiment_classification: + (Pipelines.sentiment_classification, + 'damo/nlp_structbert_sentiment-classification_chinese-base'), + Tasks.text_classification: ('bert-sentiment-analysis', + 'damo/bert-base-sst2'), + Tasks.image_matting: (Pipelines.image_matting, + 'damo/cv_unet_image-matting'), + Tasks.text_classification: (Pipelines.sentiment_analysis, + 'damo/bert-base-sst2'), + Tasks.text_generation: (Pipelines.text_generation, 'damo/nlp_palm2.0_text-generation_chinese-base'), - Tasks.image_captioning: ('ofa', None), + Tasks.image_captioning: (Pipelines.image_caption, + 'damo/ofa_image-caption_coco_large_en'), Tasks.image_generation: - ('person-image-cartoon', + (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), + Tasks.ocr_detection: (Pipelines.ocr_detection, + 'damo/cv_resnet18_ocr-detection-line-level_damo'), + Tasks.fill_mask: (Pipelines.fill_mask, 'damo/nlp_veco_fill-mask-large'), + Tasks.action_recognition: (Pipelines.action_recognition, + 'damo/cv_TAdaConv_action-recognition'), } @@ -84,30 +100,40 @@ def pipeline(task: str = 
None, if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') + assert isinstance(model, (type(None), str, Model, list)), \ + f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' + if pipeline_name is None: # get default pipeline for this task if isinstance(model, str) \ or (isinstance(model, list) and isinstance(model[0], str)): - - # if is_model_name(model): - if (isinstance(model, str) and model.startswith('damo/')) \ - or (isinstance(model, list) and model[0].startswith('damo/')) \ - or (isinstance(model, str) and osp.exists(model)): - # TODO @wenmeng.zwm add support when model is a str of modelhub address - # read pipeline info from modelhub configuration file. - pipeline_name, default_model_repo = get_default_pipeline_info( - task) + if is_official_hub_path(model): + # read config file from hub and parse + cfg = read_config(model) if isinstance( + model, str) else read_config(model[0]) + assert hasattr( + cfg, + 'pipeline'), 'pipeline config is missing from config file.' + pipeline_name = cfg.pipeline.type else: + # used for test case, when model is str and is not hub path pipeline_name = get_pipeline_by_model_name(task, model) + elif isinstance(model, Model) or \ + (isinstance(model, list) and isinstance(model[0], Model)): + # get pipeline info from Model object + first_model = model[0] if isinstance(model, list) else model + if not hasattr(first_model, 'pipeline'): + # model is instantiated by user, we should parse config again + cfg = read_config(first_model.model_dir) + assert hasattr( + cfg, + 'pipeline'), 'pipeline config is missing from config file.' + first_model.pipeline = cfg.pipeline + pipeline_name = first_model.pipeline.type else: pipeline_name, default_model_repo = get_default_pipeline_info(task) - - if model is None: model = default_model_repo - assert isinstance(model, (type(None), str, Model, list)), \ - f'model should be either None, str, List[str], Model, or List[Model], but got {type(model)}' - cfg = ConfigDict(type=pipeline_name, model=model) if kwargs: diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 79c85c19..68d875ec 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -1,2 +1,4 @@ +from .action_recognition_pipeline import ActionRecognitionPipeline from .image_cartoon_pipeline import ImageCartoonPipeline from .image_matting_pipeline import ImageMattingPipeline +from .ocr_detection_pipeline import OCRDetectionPipeline diff --git a/modelscope/pipelines/cv/action_recognition_pipeline.py b/modelscope/pipelines/cv/action_recognition_pipeline.py new file mode 100644 index 00000000..845f8f9a --- /dev/null +++ b/modelscope/pipelines/cv/action_recognition_pipeline.py @@ -0,0 +1,65 @@ +import math +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.action_recognition.models import BaseVideoModel +from modelscope.pipelines.base import Input +from modelscope.preprocessors.video import ReadVideoData +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from ..base import Pipeline +from ..builder import PIPELINES + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.action_recognition, module_name=Pipelines.action_recognition) +class ActionRecognitionPipeline(Pipeline): + + 
def __init__(self, model: str): + super().__init__(model=model) + model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {model_path}') + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + self.infer_model = BaseVideoModel(cfg=self.cfg).cuda() + self.infer_model.eval() + self.infer_model.load_state_dict(torch.load(model_path)['model_state']) + self.label_mapping = self.cfg.label_mapping + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + video_input_data = ReadVideoData(self.cfg, input).cuda() + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + result = {'video_data': video_input_data} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + pred = self.perform_inference(input['video_data']) + output_label = self.label_mapping[str(pred)] + return {'output_label': output_label} + + @torch.no_grad() + def perform_inference(self, data, max_bsz=4): + iter_num = math.ceil(data.size(0) / max_bsz) + preds_list = [] + for i in range(iter_num): + preds_list.append( + self.infer_model(data[i * max_bsz:(i + 1) * max_bsz])[0]) + pred = torch.cat(preds_list, dim=0) + return pred.mean(dim=0).argmax().item() + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/image_cartoon_pipeline.py b/modelscope/pipelines/cv/image_cartoon_pipeline.py index d253eaf5..717336e9 100644 --- a/modelscope/pipelines/cv/image_cartoon_pipeline.py +++ b/modelscope/pipelines/cv/image_cartoon_pipeline.py @@ -6,6 +6,7 @@ import numpy as np import PIL import tensorflow as tf +from modelscope.metainfo import Pipelines from modelscope.models.cv.cartoon.facelib.facer import FaceAna from modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans import ( get_reference_facial_points, warp_and_crop_face) @@ -25,7 +26,7 @@ logger = get_logger() @PIPELINES.register_module( - Tasks.image_generation, module_name='person-image-cartoon') + Tasks.image_generation, module_name=Pipelines.person_image_cartoon) class ImageCartoonPipeline(Pipeline): def __init__(self, model: str): diff --git a/modelscope/pipelines/cv/image_matting_pipeline.py b/modelscope/pipelines/cv/image_matting_pipeline.py index 0c60dfa7..b3e27e4b 100644 --- a/modelscope/pipelines/cv/image_matting_pipeline.py +++ b/modelscope/pipelines/cv/image_matting_pipeline.py @@ -5,6 +5,7 @@ import cv2 import numpy as np import PIL +from modelscope.metainfo import Pipelines from modelscope.pipelines.base import Input from modelscope.preprocessors import load_image from modelscope.utils.constant import ModelFile, Tasks @@ -16,7 +17,7 @@ logger = get_logger() @PIPELINES.register_module( - Tasks.image_matting, module_name=Tasks.image_matting) + Tasks.image_matting, module_name=Pipelines.image_matting) class ImageMattingPipeline(Pipeline): def __init__(self, model: str): diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py new file mode 100644 index 00000000..0502fe36 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -0,0 +1,168 @@ +import math +import os +import os.path as osp +import sys +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +import PIL +import tensorflow as tf +import tf_slim as slim + +from modelscope.metainfo import Pipelines 
+from modelscope.pipelines.base import Input +from modelscope.preprocessors import load_image +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from ..base import Pipeline +from ..builder import PIPELINES +from .ocr_utils import model_resnet_mutex_v4_linewithchar, ops, utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 +tf.compat.v1.disable_eager_execution() + +logger = get_logger() + +# constant +RBOX_DIM = 5 +OFFSET_DIM = 6 +WORD_POLYGON_DIM = 8 +OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] + +FLAGS = tf.app.flags.FLAGS +tf.app.flags.DEFINE_float('node_threshold', 0.4, + 'Confidence threshold for nodes') +tf.app.flags.DEFINE_float('link_threshold', 0.6, + 'Confidence threshold for links') + + +@PIPELINES.register_module( + Tasks.ocr_detection, module_name=Pipelines.ocr_detection) +class OCRDetectionPipeline(Pipeline): + + def __init__(self, model: str): + super().__init__(model=model) + model_path = osp.join( + osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), + 'checkpoint-80000') + + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + dtype=tf.int64, + trainable=False) + variable_averages = tf.train.ExponentialMovingAverage( + 0.997, global_step) + self.input_images = tf.placeholder( + tf.float32, shape=[1, 1024, 1024, 3], name='input_images') + self.output = {} + + # detector + detector = model_resnet_mutex_v4_linewithchar.SegLinkDetector() + all_maps = detector.build_model(self.input_images, is_training=False) + + # decode local predictions + all_nodes, all_links, all_reg = [], [], [] + for i, maps in enumerate(all_maps): + cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[2] + reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) + + cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) + + lnk_prob_pos = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, :2]) + lnk_prob_mut = tf.nn.softmax(tf.reshape(lnk_maps, [-1, 4])[:, 2:]) + lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], axis=1) + + all_nodes.append(cls_prob) + all_links.append(lnk_prob) + all_reg.append(reg_maps) + + # decode segments and links + image_size = tf.shape(self.input_images)[1:3] + segments, group_indices, segment_counts, _ = ops.decode_segments_links_python( + image_size, + all_nodes, + all_links, + all_reg, + anchor_sizes=list(detector.anchor_sizes)) + + # combine segments + combined_rboxes, combined_counts = ops.combine_segments_python( + segments, group_indices, segment_counts) + self.output['combined_rboxes'] = combined_rboxes + self.output['combined_counts'] = combined_counts + + with self._session.as_default() as sess: + logger.info(f'loading model from {model_path}') + # load model + model_loader = tf.train.Saver( + variable_averages.variables_to_restore()) + model_loader.restore(sess, model_path) + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + img = np.array(load_image(input)) + elif isinstance(input, PIL.Image.Image): + img = np.array(input.convert('RGB')) + elif isinstance(input, np.ndarray): + if len(input.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + img = input[:, :, ::-1] # in rgb order + else: + raise TypeError(f'input should be either str, PIL.Image,' + f' np.array, but got {type(input)}') + h, w, c = img.shape + img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32) + img_pad[:h, :w, 
:] = img + + resize_size = 1024 + img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size)) + img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR) + img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94], + dtype=np.float32) + + resize_size = tf.stack([resize_size, resize_size]) + orig_size = tf.stack([max(h, w), max(h, w)]) + self.output['orig_size'] = orig_size + self.output['resize_size'] = resize_size + + result = {'img': np.expand_dims(img_pad_resize, axis=0)} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + with self._session.as_default(): + feed_dict = {self.input_images: input['img']} + sess_outputs = self._session.run(self.output, feed_dict=feed_dict) + return sess_outputs + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + rboxes = inputs['combined_rboxes'][0] + count = inputs['combined_counts'][0] + rboxes = rboxes[:count, :] + + # convert rboxes to polygons and find its coordinates on the original image + orig_h, orig_w = inputs['orig_size'] + resize_h, resize_w = inputs['resize_size'] + polygons = utils.rboxes_to_polygons(rboxes) + scale_y = float(orig_h) / float(resize_h) + scale_x = float(orig_w) / float(resize_w) + + # confine polygons inside image + polygons[:, ::2] = np.maximum( + 0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1)) + polygons[:, 1::2] = np.maximum( + 0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1)) + polygons = np.round(polygons).astype(np.int32) + + # nms + dt_n9 = [o + [utils.cal_width(o)] for o in polygons.tolist()] + dt_nms = utils.nms_python(dt_n9) + dt_polygons = np.array([o[:8] for o in dt_nms]) + + result = {'det_polygons': dt_polygons} + return result diff --git a/modelscope/pipelines/cv/ocr_utils/__init__.py b/modelscope/pipelines/cv/ocr_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py new file mode 100644 index 00000000..50b8ba02 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py @@ -0,0 +1,158 @@ +import tensorflow as tf +import tf_slim as slim + +from . import ops, resnet18_v1, resnet_utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + +# constants +OFFSET_DIM = 6 + +N_LOCAL_LINKS = 8 +N_CROSS_LINKS = 4 +N_SEG_CLASSES = 2 +N_LNK_CLASSES = 4 + +POS_LABEL = 1 +NEG_LABEL = 0 + + +class SegLinkDetector(): + + def __init__(self): + self.anchor_sizes = [6., 11.84210526, 23.68421053, 45., 90., 150.] 
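        # Editor's note (not part of the original diff): one anchor size per
        # detection scale.  build_model() below attaches a detection
        # classifier to six backbone feature maps (conv3_3, conv4_3, fc7,
        # conv8_2, conv9_2, conv10_2), and OCRDetectionPipeline passes these
        # anchor sizes on to ops.decode_segments_links_python() when decoding
        # segments and links back into image coordinates.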
+ + def _detection_classifier(self, + maps, + ksize, + weight_decay, + cross_links=False, + scope=None): + + with tf.variable_scope(scope): + seg_depth = N_SEG_CLASSES + if cross_links: + lnk_depth = N_LNK_CLASSES * (N_LOCAL_LINKS + N_CROSS_LINKS) + else: + lnk_depth = N_LNK_CLASSES * N_LOCAL_LINKS + reg_depth = OFFSET_DIM + map_depth = maps.get_shape()[3] + inter_maps, inter_relu = ops.conv2d( + maps, map_depth, 256, 1, 1, 'SAME', scope='conv_inter') + + dir_maps, dir_relu = ops.conv2d( + inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_dir') + cen_maps, cen_relu = ops.conv2d( + inter_relu, 256, 2, ksize, 1, 'SAME', scope='conv_cen') + pol_maps, pol_relu = ops.conv2d( + inter_relu, 256, 8, ksize, 1, 'SAME', scope='conv_pol') + concat_relu = tf.concat([dir_relu, cen_relu, pol_relu], axis=-1) + _, lnk_embedding = ops.conv_relu( + concat_relu, 12, 256, 1, 1, scope='lnk_embedding') + lnk_maps, lnk_relu = ops.conv2d( + inter_relu + lnk_embedding, + 256, + lnk_depth, + ksize, + 1, + 'SAME', + scope='conv_lnk') + + char_seg_maps, char_seg_relu = ops.conv2d( + inter_relu, + 256, + seg_depth, + ksize, + 1, + 'SAME', + scope='conv_char_cls') + char_reg_maps, char_reg_relu = ops.conv2d( + inter_relu, + 256, + reg_depth, + ksize, + 1, + 'SAME', + scope='conv_char_reg') + concat_char_relu = tf.concat([char_seg_relu, char_reg_relu], + axis=-1) + _, char_embedding = ops.conv_relu( + concat_char_relu, 8, 256, 1, 1, scope='conv_char_embedding') + seg_maps, seg_relu = ops.conv2d( + inter_relu + char_embedding, + 256, + seg_depth, + ksize, + 1, + 'SAME', + scope='conv_cls') + reg_maps, reg_relu = ops.conv2d( + inter_relu + char_embedding, + 256, + reg_depth, + ksize, + 1, + 'SAME', + scope='conv_reg') + + return seg_relu, lnk_relu, reg_relu + + def _build_cnn(self, images, weight_decay, is_training): + with slim.arg_scope( + resnet18_v1.resnet_arg_scope(weight_decay=weight_decay)): + logits, end_points = resnet18_v1.resnet_v1_18( + images, is_training=is_training, scope='resnet_v1_18') + + outputs = { + 'conv3_3': end_points['pool1'], + 'conv4_3': end_points['pool2'], + 'fc7': end_points['pool3'], + 'conv8_2': end_points['pool4'], + 'conv9_2': end_points['pool5'], + 'conv10_2': end_points['pool6'], + } + return outputs + + def build_model(self, images, is_training=True, scope=None): + + weight_decay = 5e-4 # FLAGS.weight_decay + cnn_outputs = self._build_cnn(images, weight_decay, is_training) + det_0 = self._detection_classifier( + cnn_outputs['conv3_3'], + 3, + weight_decay, + cross_links=False, + scope='dete_0') + det_1 = self._detection_classifier( + cnn_outputs['conv4_3'], + 3, + weight_decay, + cross_links=True, + scope='dete_1') + det_2 = self._detection_classifier( + cnn_outputs['fc7'], + 3, + weight_decay, + cross_links=True, + scope='dete_2') + det_3 = self._detection_classifier( + cnn_outputs['conv8_2'], + 3, + weight_decay, + cross_links=True, + scope='dete_3') + det_4 = self._detection_classifier( + cnn_outputs['conv9_2'], + 3, + weight_decay, + cross_links=True, + scope='dete_4') + det_5 = self._detection_classifier( + cnn_outputs['conv10_2'], + 3, + weight_decay, + cross_links=True, + scope='dete_5') + outputs = [det_0, det_1, det_2, det_3, det_4, det_5] + return outputs diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py new file mode 100644 index 00000000..2bc8a8bf --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -0,0 +1,1098 @@ +import math +import os +import shutil +import uuid + +import cv2 +import numpy as np +import tensorflow 
as tf
+
+from . import utils
+
+if tf.__version__ >= '2.0':
+    tf = tf.compat.v1
+
+FLAGS = tf.app.flags.FLAGS
+tf.app.flags.DEFINE_string('weight_init_method', 'xavier',
+                           'Weight initialization method')
+
+# constants
+OFFSET_DIM = 6
+RBOX_DIM = 5
+
+N_LOCAL_LINKS = 8
+N_CROSS_LINKS = 4
+N_SEG_CLASSES = 2
+N_LNK_CLASSES = 4
+
+MATCH_STATUS_POS = 1
+MATCH_STATUS_NEG = -1
+MATCH_STATUS_IGNORE = 0
+MUT_LABEL = 3
+POS_LABEL = 1
+NEG_LABEL = 0
+
+N_DET_LAYERS = 6
+
+
+def load_oplib(lib_name):
+    """
+    Load TensorFlow operator library.
+    """
+    # use absolute path so that ops.py can be called from other directory
+    lib_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        'lib{0}.so'.format(lib_name))
+    # duplicate the library under a random new name so that a running
+    # program is not interrupted when the original library is updated
+    lib_copy_path = '/tmp/lib{0}_{1}.so'.format(
+        str(uuid.uuid4())[:8], lib_name)
+    shutil.copyfile(lib_path, lib_copy_path)
+    oplib = tf.load_op_library(lib_copy_path)
+    return oplib
+
+
+def _nn_variable(name, shape, init_method, collection=None, **kwargs):
+    """
+    Create or reuse a variable
+    ARGS
+        name: variable name
+        shape: variable shape
+        init_method: 'zero', 'kaiming', 'xavier', or (mean, std)
+        collection: if not None, add the variable to this collection
+        kwargs: extra parameters passed to tf.get_variable
+    RETURN
+        var: a new or existing variable
+    """
+    if init_method == 'zero':
+        initializer = tf.constant_initializer(0.0)
+    elif init_method == 'kaiming':
+        if len(shape) == 4:  # convolutional filters
+            kh, kw, n_in = shape[:3]
+            init_std = math.sqrt(2.0 / (kh * kw * n_in))
+        elif len(shape) == 2:  # linear weights
+            n_in, n_out = shape
+            init_std = math.sqrt(1.0 / n_out)
+        else:
+            raise ValueError('Unsupported shape: {}'.format(shape))
+        initializer = tf.truncated_normal_initializer(0.0, init_std)
+    elif init_method == 'xavier':
+        # glorot/xavier normal initialization for both conv and linear shapes
+        initializer = tf.keras.initializers.glorot_normal()
+    elif isinstance(init_method, tuple):
+        assert len(init_method) == 2
+        initializer = tf.truncated_normal_initializer(init_method[0],
+                                                      init_method[1])
+    else:
+        raise ValueError(
+            'Unsupported weight initialization method: {}'.format(init_method))
+
+    var = tf.get_variable(name, shape=shape, initializer=initializer, **kwargs)
+    if collection is not None:
+        tf.add_to_collection(collection, var)
+
+    return var
+
+
+def conv2d(x,
+           n_in,
+           n_out,
+           ksize,
+           stride=1,
+           padding='SAME',
+           weight_init=None,
+           bias=True,
+           relu=False,
+           scope=None,
+           **kwargs):
+    weight_init = weight_init or FLAGS.weight_init_method
+    trainable = kwargs.get('trainable', True)
+    # input_dim = n_in
+    if padding == 'SAME':
+        # explicit symmetric zero-padding, followed by a 'VALID' convolution below
+        in_height = x.get_shape()[1]
+        in_width = x.get_shape()[2]
+        if in_height % stride == 0:
+            pad_along_height = max(ksize - stride, 0)
+        else:
+            pad_along_height = max(ksize - (in_height % stride), 0)
+        if in_width % stride == 0:
+            pad_along_width = max(ksize - stride, 0)
+        else:
+            pad_along_width = max(ksize - (in_width % stride), 0)
+        pad_bottom = pad_along_height // 2
+        pad_top = pad_along_height - pad_bottom
+        pad_right = pad_along_width // 2
+        pad_left = pad_along_width - pad_right
+        paddings = tf.constant([[0, 0], [pad_top, pad_bottom],
+                                [pad_left, pad_right], [0, 0]])
+        input_padded = tf.pad(x, paddings, 'CONSTANT')
+    else:
+        input_padded = x
+
+    with tf.variable_scope(scope or 'conv2d'):
+        # convolution
+        kernel = _nn_variable(
+            'weight', [ksize, ksize, n_in, n_out],
+            weight_init,
+            collection='weights' if trainable else
None, + **kwargs) + yc = tf.nn.conv2d( + input_padded, kernel, [1, stride, stride, 1], padding='VALID') + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + yb = tf.nn.bias_add(yc, bias) + # apply ReLU + y = yb + if relu is True: + y = tf.nn.relu(yb) + return yb, y + + +def group_conv2d_relu(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='group_conv2d', + **kwargs): + group_axis = len(x.get_shape()) - 1 + splits = tf.split(x, [int(n_in / group)] * group, group_axis) + + conv_list = [] + for i in range(group): + conv_split, relu_split = conv2d( + splits[i], + n_in / group, + n_out / group, + ksize=ksize, + stride=stride, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope='%s_%d' % (name, i)) + conv_list.append(conv_split) + conv = tf.concat(values=conv_list, axis=group_axis, name=name + '_concat') + relu = tf.nn.relu(conv) + return conv, relu + + +def group_conv2d_bn_relu(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='group_conv2d', + **kwargs): + group_axis = len(x.get_shape()) - 1 + splits = tf.split(x, [int(n_in / group)] * group, group_axis) + + conv_list = [] + for i in range(group): + conv_split, relu_split = conv2d( + splits[i], + n_in / group, + n_out / group, + ksize=ksize, + stride=stride, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope='%s_%d' % (name, i)) + conv_list.append(conv_split) + conv = tf.concat(values=conv_list, axis=group_axis, name=name + '_concat') + with tf.variable_scope(name + '_bn'): + bn = tf.layers.batch_normalization( + conv, momentum=0.9, epsilon=1e-5, scale=True, training=True) + relu = tf.nn.relu(bn) + return conv, relu + + +def next_conv(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='next_conv2d', + **kwargs): + conv_a, relu_a = conv_relu( + x, + n_in, + n_in / 2, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_a', + **kwargs) + + conv_b, relu_b = group_conv2d_relu( + relu_a, + n_in / 2, + n_out / 2, + ksize=ksize, + stride=stride, + group=group, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + name=name + '_b', + **kwargs) + + conv_c, relu_c = conv_relu( + relu_b, + n_out / 2, + n_out, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_c', + **kwargs) + + return conv_c, relu_c + + +def next_conv_bn(x, + n_in, + n_out, + ksize, + stride=1, + group=4, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + name='next_conv2d', + **kwargs): + conv_a, relu_a = conv_bn_relu( + x, + n_in, + n_in / 2, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_a', + **kwargs) + + conv_b, relu_b = group_conv2d_bn_relu( + relu_a, + n_in / 2, + n_out / 2, + ksize=ksize, + stride=stride, + group=group, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + name=name + '_b', + **kwargs) + + conv_c, relu_c = conv_bn_relu( + relu_b, + n_out / 2, + n_out, + ksize=1, + stride=1, + padding=padding, + weight_init=weight_init, + bias=bias, + relu=relu, + scope=name + '_c', + **kwargs) + + return conv_c, relu_c + + +def conv2d_ori(x, + n_in, + n_out, + ksize, + 
stride=1, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + scope=None, + **kwargs): + weight_init = weight_init or FLAGS.weight_init_method + trainable = kwargs.get('trainable', True) + + with tf.variable_scope(scope or 'conv2d'): + # convolution + kernel = _nn_variable( + 'weight', [ksize, ksize, n_in, n_out], + weight_init, + collection='weights' if trainable else None, + **kwargs) + y = tf.nn.conv2d(x, kernel, [1, stride, stride, 1], padding=padding) + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + y = tf.nn.bias_add(y, bias) + # apply ReLU + if relu is True: + y = tf.nn.relu(y) + return y + + +def conv_relu(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + return conv2d(*args, **kwargs) + + +def conv_bn_relu(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + conv, relu = conv2d(*args, **kwargs) + with tf.variable_scope(kwargs['scope'] + '_bn'): + bn = tf.layers.batch_normalization( + conv, momentum=0.9, epsilon=1e-5, scale=True, training=True) + bn_relu = tf.nn.relu(bn) + return bn, bn_relu + + +def conv_relu_ori(*args, **kwargs): + kwargs['relu'] = True + if 'scope' not in kwargs: + kwargs['scope'] = 'conv_relu' + return conv2d_ori(*args, **kwargs) + + +def atrous_conv2d(x, + n_in, + n_out, + ksize, + dilation, + padding='SAME', + weight_init=None, + bias=True, + relu=False, + scope=None, + **kwargs): + weight_init = weight_init or FLAGS.weight_init_method + trainable = kwargs.get('trainable', True) + with tf.variable_scope(scope or 'atrous_conv2d'): + # atrous convolution + kernel = _nn_variable( + 'weight', [ksize, ksize, n_in, n_out], + weight_init, + collection='weights' if trainable else None, + **kwargs) + y = tf.nn.atrous_conv2d(x, kernel, dilation, padding=padding) + # add bias + if bias is True: + bias = _nn_variable( + 'bias', [n_out], + 'zero', + collection='biases' if trainable else None, + **kwargs) + y = tf.nn.bias_add(y, bias) + # apply ReLU + if relu is True: + y = tf.nn.relu(y) + return y + + +def avg_pool(x, ksize, stride, padding='SAME', scope=None): + with tf.variable_scope(scope or 'avg_pool'): + y = tf.nn.avg_pool(x, [1, ksize, ksize, 1], [1, stride, stride, 1], + padding) + return y + + +def max_pool(x, ksize, stride, padding='SAME', scope=None): + with tf.variable_scope(scope or 'max_pool'): + y = tf.nn.max_pool(x, [1, ksize, ksize, 1], [1, stride, stride, 1], + padding) + return y + + +def score_loss(gt_labels, match_scores, n_classes): + """ + Classification loss + ARGS + gt_labels: int32 [n] + match_scores: [n, n_classes] + RETURN + loss + """ + embeddings = tf.one_hot(tf.cast(gt_labels, tf.int64), n_classes, 1.0, 0.0) + losses = tf.nn.softmax_cross_entropy_with_logits(match_scores, embeddings) + return tf.reduce_sum(losses) + + +def smooth_l1_loss(offsets, gt_offsets, scope=None): + """ + Smooth L1 loss between offsets and encoded_gt + ARGS + offsets: [m?, 5], predicted offsets for one example + gt_offsets: [m?, 5], correponding groundtruth offsets + RETURN + loss: scalar + """ + with tf.variable_scope(scope or 'smooth_l1_loss'): + gt_offsets = tf.stop_gradient(gt_offsets) + diff = tf.abs(offsets - gt_offsets) + lesser_mask = tf.cast(tf.less(diff, 1.0), tf.float32) + larger_mask = 1.0 - lesser_mask + losses1 = (0.5 * tf.square(diff)) * lesser_mask + losses2 = (diff - 0.5) * larger_mask + return tf.reduce_sum(losses1 + losses2, 1) + + +def 
polygon_to_rboxe(polygon): + x1 = polygon[0] + y1 = polygon[1] + x2 = polygon[2] + y2 = polygon[3] + x3 = polygon[4] + y3 = polygon[5] + x4 = polygon[6] + y4 = polygon[7] + c_x = (x1 + x2 + x3 + x4) / 4 + c_y = (y1 + y2 + y3 + y4) / 4 + w1 = point_dist(x1, y1, x2, y2) + w2 = point_dist(x3, y3, x4, y4) + h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) + h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) + h = h1 + h2 + w = (w1 + w2) / 2 + theta1 = np.arctan2(y2 - y1, x2 - x1) + theta2 = np.arctan2(y3 - y4, x3 - x4) + theta = (theta1 + theta2) / 2 + return np.array([c_x, c_y, w, h, theta]) + + +def point_dist(x1, y1, x2, y2): + return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) + + +def point_line_dist(px, py, x1, y1, x2, y2): + eps = 1e-6 + dx = x2 - x1 + dy = y2 - y1 + div = np.sqrt(dx * dx + dy * dy) + eps + dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div + return dist + + +def get_combined_polygon(rboxes, resize_size): + image_w = resize_size[1] + image_h = resize_size[0] + img = np.zeros((image_h, image_w, 3), np.uint8) + for i in range(rboxes.shape[0]): + segment = np.reshape( + np.array(utils.rboxes_to_polygons(rboxes)[i, :], np.int32), + (-1, 1, 2)) + cv2.drawContours(img, [segment], 0, (255, 255, 255), -1) + img2gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + ret, thresh = cv2.threshold(img2gray, 127, 255, cv2.THRESH_BINARY) + im2, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) + if len(contours) > 0: + cnt = contours[0] + max_area = cv2.contourArea(cnt) + # get max_area + for cont in contours: + if cv2.contourArea(cont) > max_area: + cnt = cont + max_area = cv2.contourArea(cont) + rect = cv2.minAreaRect(cnt) + combined_polygon = np.array(cv2.boxPoints(rect)).reshape(-1) + else: + combined_polygon = np.array([0, 0, 0, 0, 0, 0, 0, 0]) + + return combined_polygon + + +def combine_segs(segs): + segs = np.asarray(segs) + assert segs.ndim == 2, 'invalid segs ndim' + assert segs.shape[-1] == 6, 'invalid segs shape' + + if len(segs) == 1: + cx = segs[0, 0] + cy = segs[0, 1] + w = segs[0, 2] + h = segs[0, 3] + theta_sin = segs[0, 4] + theta_cos = segs[0, 5] + theta = np.arctan2(theta_sin, theta_cos) + return np.array([cx, cy, w, h, theta]) + + # find the best straight line fitting all center points: y = kx + b + cxs = segs[:, 0] + cys = segs[:, 1] + + theta_coss = segs[:, 4] + theta_sins = segs[:, 5] + + bar_theta = np.arctan2(theta_sins.sum(), theta_coss.sum()) + k = np.tan(bar_theta) + b = np.mean(cys - k * cxs) + + proj_xs = (k * cys + cxs - k * b) / (k**2 + 1) + proj_ys = (k * k * cys + k * cxs + b) / (k**2 + 1) + proj_points = np.stack((proj_xs, proj_ys), -1) + + # find the max distance + max_dist = -1 + idx1 = -1 + idx2 = -1 + + for i in range(len(proj_points)): + point1 = proj_points[i, :] + for j in range(i + 1, len(proj_points)): + point2 = proj_points[j, :] + dist = np.sqrt(np.sum((point1 - point2)**2)) + if dist > max_dist: + idx1 = i + idx2 = j + max_dist = dist + assert idx1 >= 0 and idx2 >= 0 + # the bbox: bcx, bcy, bw, bh, average_theta + seg1 = segs[idx1, :] + seg2 = segs[idx2, :] + bcx, bcy = (seg1[:2] + seg2[:2]) / 2.0 + bh = np.mean(segs[:, 3]) + bw = max_dist + (seg1[2] + seg2[2]) / 2.0 + return bcx, bcy, bw, bh, bar_theta + + +def combine_segments_batch(segments_batch, group_indices_batch, + segment_counts_batch): + batch_size = 1 + combined_rboxes_batch = [] + combined_counts_batch = [] + for image_id in range(batch_size): + group_count = segment_counts_batch[image_id] + segments = segments_batch[image_id, :, :] 
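+        # The rest of this loop gathers the image's segments by their group
+        # index and merges each group with combine_segs() above into a single
+        # rotated box (cx, cy, w, h, theta); images that yield fewer combined
+        # boxes than the batch maximum are zero-padded afterwards so that the
+        # arrays returned through tf.py_func stay rectangular.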
+ group_indices = group_indices_batch[image_id, :] + combined_rboxes = [] + for i in range(group_count): + segments_group = segments[np.where(group_indices == i)[0], :] + if segments_group.shape[0] > 0: + combined_rbox = combine_segs(segments_group) + combined_rboxes.append(combined_rbox) + combined_rboxes_batch.append(combined_rboxes) + combined_counts_batch.append(len(combined_rboxes)) + + max_count = np.max(combined_counts_batch) + for image_id in range(batch_size): + if not combined_counts_batch[image_id] == max_count: + combined_rboxes_pad = (max_count - combined_counts_batch[image_id] + ) * [RBOX_DIM * [0.0]] + combined_rboxes_batch[image_id] = np.vstack( + (combined_rboxes_batch[image_id], + np.array(combined_rboxes_pad))) + + return np.asarray(combined_rboxes_batch, + np.float32), np.asarray(combined_counts_batch, np.int32) + + +# combine_segments rewrite in python version +def combine_segments_python(segments, group_indices, segment_counts): + combined_rboxes, combined_counts = tf.py_func( + combine_segments_batch, [segments, group_indices, segment_counts], + [tf.float32, tf.int32]) + return combined_rboxes, combined_counts + + +# decode_segments_links rewrite in python version +def get_coord(offsets, map_size, offsets_defaults): + if offsets < offsets_defaults[1][0]: + l_idx = 0 + x = offsets % map_size[0][1] + y = offsets // map_size[0][1] + elif offsets < offsets_defaults[2][0]: + l_idx = 1 + x = (offsets - offsets_defaults[1][0]) % map_size[1][1] + y = (offsets - offsets_defaults[1][0]) // map_size[1][1] + elif offsets < offsets_defaults[3][0]: + l_idx = 2 + x = (offsets - offsets_defaults[2][0]) % map_size[2][1] + y = (offsets - offsets_defaults[2][0]) // map_size[2][1] + elif offsets < offsets_defaults[4][0]: + l_idx = 3 + x = (offsets - offsets_defaults[3][0]) % map_size[3][1] + y = (offsets - offsets_defaults[3][0]) // map_size[3][1] + elif offsets < offsets_defaults[5][0]: + l_idx = 4 + x = (offsets - offsets_defaults[4][0]) % map_size[4][1] + y = (offsets - offsets_defaults[4][0]) // map_size[4][1] + else: + l_idx = 5 + x = (offsets - offsets_defaults[5][0]) % map_size[5][1] + y = (offsets - offsets_defaults[5][0]) // map_size[5][1] + + return l_idx, x, y + + +def get_coord_link(offsets, map_size, offsets_defaults): + if offsets < offsets_defaults[1][1]: + offsets_node = offsets // N_LOCAL_LINKS + link_idx = offsets % N_LOCAL_LINKS + else: + offsets_node = (offsets - offsets_defaults[1][1]) // ( + N_LOCAL_LINKS + N_CROSS_LINKS) + offsets_defaults[1][0] + link_idx = (offsets - offsets_defaults[1][1]) % ( + N_LOCAL_LINKS + N_CROSS_LINKS) + l_idx, x, y = get_coord(offsets_node, map_size, offsets_defaults) + return l_idx, x, y, link_idx + + +def is_valid_coord(l_idx, x, y, map_size): + w = map_size[l_idx][1] + h = map_size[l_idx][0] + return x >= 0 and x < w and y >= 0 and y < h + + +def get_neighbours(l_idx, x, y, map_size, offsets_defaults): + if l_idx == 0: + coord = [(0, x - 1, y - 1), (0, x, y - 1), (0, x + 1, y - 1), + (0, x - 1, y), (0, x + 1, y), (0, x - 1, y + 1), + (0, x, y + 1), (0, x + 1, y + 1)] + else: + coord = [(l_idx, x - 1, y - 1), + (l_idx, x, y - 1), (l_idx, x + 1, y - 1), (l_idx, x - 1, y), + (l_idx, x + 1, y), (l_idx, x - 1, y + 1), (l_idx, x, y + 1), + (l_idx, x + 1, y + 1), (l_idx - 1, 2 * x, 2 * y), + (l_idx - 1, 2 * x + 1, 2 * y), (l_idx - 1, 2 * x, 2 * y + 1), + (l_idx - 1, 2 * x + 1, 2 * y + 1)] + neighbours_offsets = [] + link_idx = 0 + for nl_idx, nx, ny in coord: + if is_valid_coord(nl_idx, nx, ny, map_size): + neighbours_offset_node = 
offsets_defaults[nl_idx][ + 0] + map_size[nl_idx][1] * ny + nx + if l_idx == 0: + neighbours_offset_link = offsets_defaults[l_idx][1] + ( + map_size[l_idx][1] * y + x) * N_LOCAL_LINKS + link_idx + else: + off_tmp = (map_size[l_idx][1] * y + x) * ( + N_LOCAL_LINKS + N_CROSS_LINKS) + neighbours_offset_link = offsets_defaults[l_idx][ + 1] + off_tmp + link_idx + neighbours_offsets.append( + [neighbours_offset_node, neighbours_offset_link, link_idx]) + link_idx += 1 + # [node_offsets, link_offsets, link_idx(0-7/11)] + return neighbours_offsets + + +def decode_segments_links_python(image_size, all_nodes, all_links, all_reg, + anchor_sizes): + batch_size = 1 # FLAGS.test_batch_size + # offsets = 12285 #768 + all_nodes_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_SEG_CLASSES]) for o in all_nodes], + axis=1) + all_links_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_LNK_CLASSES]) for o in all_links], + axis=1) + all_reg_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, OFFSET_DIM]) for o in all_reg], axis=1) + segments, group_indices, segment_counts, group_indices_all = tf.py_func( + decode_batch, [ + all_nodes_flat, all_links_flat, all_reg_flat, image_size, + tf.constant(anchor_sizes) + ], [tf.float32, tf.int32, tf.int32, tf.int32]) + return segments, group_indices, segment_counts, group_indices_all + + +def decode_segments_links_train(image_size, all_nodes, all_links, all_reg, + anchor_sizes): + batch_size = FLAGS.train_batch_size + # offsets = 12285 #768 + all_nodes_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_SEG_CLASSES]) for o in all_nodes], + axis=1) + all_links_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, N_LNK_CLASSES]) for o in all_links], + axis=1) + all_reg_flat = tf.concat( + [tf.reshape(o, [batch_size, -1, OFFSET_DIM]) for o in all_reg], axis=1) + segments, group_indices, segment_counts, group_indices_all = tf.py_func( + decode_batch, [ + all_nodes_flat, all_links_flat, all_reg_flat, image_size, + tf.constant(anchor_sizes) + ], [tf.float32, tf.int32, tf.int32, tf.int32]) + return segments, group_indices, segment_counts, group_indices_all + + +def decode_batch(all_nodes, all_links, all_reg, image_size, anchor_sizes): + batch_size = all_nodes.shape[0] + batch_segments = [] + batch_group_indices = [] + batch_segments_counts = [] + batch_group_indices_all = [] + for image_id in range(batch_size): + image_node_scores = all_nodes[image_id, :, :] + image_link_scores = all_links[image_id, :, :] + image_reg = all_reg[image_id, :, :] + image_segments, image_group_indices, image_segments_counts, image_group_indices_all = decode_image( + image_node_scores, image_link_scores, image_reg, image_size, + anchor_sizes) + batch_segments.append(image_segments) + batch_group_indices.append(image_group_indices) + batch_segments_counts.append(image_segments_counts) + batch_group_indices_all.append(image_group_indices_all) + max_count = np.max(batch_segments_counts) + for image_id in range(batch_size): + if not batch_segments_counts[image_id] == max_count: + batch_segments_pad = (max_count - batch_segments_counts[image_id] + ) * [OFFSET_DIM * [0.0]] + batch_segments[image_id] = np.vstack( + (batch_segments[image_id], np.array(batch_segments_pad))) + batch_group_indices[image_id] = np.hstack( + (batch_group_indices[image_id], + np.array( + (max_count - batch_segments_counts[image_id]) * [-1]))) + return np.asarray(batch_segments, np.float32), np.asarray( + batch_group_indices, + np.int32), np.asarray(batch_segments_counts, + np.int32), np.asarray(batch_group_indices_all, + 
np.int32) + + +def decode_image(image_node_scores, image_link_scores, image_reg, image_size, + anchor_sizes): + map_size = [] + offsets_defaults = [] + offsets_default_node = 0 + offsets_default_link = 0 + for i in range(N_DET_LAYERS): + offsets_defaults.append([offsets_default_node, offsets_default_link]) + map_size.append(image_size // (2**(2 + i))) + offsets_default_node += map_size[i][0] * map_size[i][1] + if i == 0: + offsets_default_link += map_size[i][0] * map_size[i][ + 1] * N_LOCAL_LINKS + else: + offsets_default_link += map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + N_CROSS_LINKS) + + image_group_indices_all = decode_image_by_join(image_node_scores, + image_link_scores, + FLAGS.node_threshold, + FLAGS.link_threshold, + map_size, offsets_defaults) + image_group_indices_all -= 1 + image_group_indices = image_group_indices_all[np.where( + image_group_indices_all >= 0)[0]] + image_segments_counts = len(image_group_indices) + # convert image_reg to segments with scores(OFFSET_DIM+1) + image_segments = np.zeros((image_segments_counts, OFFSET_DIM), + dtype=np.float32) + for i, offsets in enumerate(np.where(image_group_indices_all >= 0)[0]): + encoded_cx = image_reg[offsets, 0] + encoded_cy = image_reg[offsets, 1] + encoded_width = image_reg[offsets, 2] + encoded_height = image_reg[offsets, 3] + encoded_theta_cos = image_reg[offsets, 4] + encoded_theta_sin = image_reg[offsets, 5] + + l_idx, x, y = get_coord(offsets, map_size, offsets_defaults) + rs = anchor_sizes[l_idx] + eps = 1e-6 + image_segments[i, 0] = encoded_cx * rs + (2**(2 + l_idx)) * (x + 0.5) + image_segments[i, 1] = encoded_cy * rs + (2**(2 + l_idx)) * (y + 0.5) + image_segments[i, 2] = np.exp(encoded_width) * rs - eps + image_segments[i, 3] = np.exp(encoded_height) * rs - eps + image_segments[i, 4] = encoded_theta_cos + image_segments[i, 5] = encoded_theta_sin + + return image_segments, image_group_indices, image_segments_counts, image_group_indices_all + + +def decode_image_by_join(node_scores, link_scores, node_threshold, + link_threshold, map_size, offsets_defaults): + node_mask = node_scores[:, POS_LABEL] >= node_threshold + link_mask = link_scores[:, POS_LABEL] >= link_threshold + group_mask = np.zeros_like(node_mask, np.int32) - 1 + offsets_pos = np.where(node_mask == 1)[0] + + def find_parent(point): + return group_mask[point] + + def set_parent(point, parent): + group_mask[point] = parent + + def is_root(point): + return find_parent(point) == -1 + + def find_root(point): + root = point + update_parent = False + while not is_root(root): + root = find_parent(root) + update_parent = True + + # for acceleration of find_root + if update_parent: + set_parent(point, root) + + return root + + def join(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2: + set_parent(root1, root2) + + def get_all(): + root_map = {} + + def get_index(root): + if root not in root_map: + root_map[root] = len(root_map) + 1 + return root_map[root] + + mask = np.zeros_like(node_mask, dtype=np.int32) + for i, point in enumerate(offsets_pos): + point_root = find_root(point) + bbox_idx = get_index(point_root) + mask[point] = bbox_idx + return mask + + # join by link + pos_link = 0 + for i, offsets in enumerate(offsets_pos): + l_idx, x, y = get_coord(offsets, map_size, offsets_defaults) + neighbours = get_neighbours(l_idx, x, y, map_size, offsets_defaults) + for n_idx, noffsets in enumerate(neighbours): + link_value = link_mask[noffsets[1]] + node_cls = node_mask[noffsets[0]] + if link_value and node_cls: + pos_link += 
1 + join(offsets, noffsets[0]) + # print(pos_link) + mask = get_all() + return mask + + +def get_link_mask(node_mask, offsets_defaults, link_max): + link_mask = np.zeros_like(link_max) + link_mask[0:offsets_defaults[1][1]] = np.tile( + node_mask[0:offsets_defaults[1][0]], + (N_LOCAL_LINKS, 1)).transpose().reshape(offsets_defaults[1][1]) + link_mask[offsets_defaults[1][1]:offsets_defaults[2][1]] = np.tile( + node_mask[offsets_defaults[1][0]:offsets_defaults[2][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[2][1] - offsets_defaults[1][1])) + link_mask[offsets_defaults[2][1]:offsets_defaults[3][1]] = np.tile( + node_mask[offsets_defaults[2][0]:offsets_defaults[3][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[3][1] - offsets_defaults[2][1])) + link_mask[offsets_defaults[3][1]:offsets_defaults[4][1]] = np.tile( + node_mask[offsets_defaults[3][0]:offsets_defaults[4][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[4][1] - offsets_defaults[3][1])) + link_mask[offsets_defaults[4][1]:offsets_defaults[5][1]] = np.tile( + node_mask[offsets_defaults[4][0]:offsets_defaults[5][0]], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (offsets_defaults[5][1] - offsets_defaults[4][1])) + link_mask[offsets_defaults[5][1]:] = np.tile( + node_mask[offsets_defaults[5][0]:], + (N_LOCAL_LINKS + N_CROSS_LINKS, 1)).transpose().reshape( + (len(link_mask) - offsets_defaults[5][1])) + + return link_mask + + +def get_link8(link_scores_raw, map_size): + # link[i-1] -local- start -16- end -cross- link[i] + link8_mask = np.zeros((link_scores_raw.shape[0])) + for i in range(N_DET_LAYERS): + if i == 0: + offsets_start = map_size[i][0] * map_size[i][1] * N_LOCAL_LINKS + offsets_end = map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + offsets_link = map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + link8_mask[:offsets_start] = 1 + else: + offsets_start = offsets_link + map_size[i][0] * map_size[i][ + 1] * N_LOCAL_LINKS + offsets_end = offsets_link + map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16) + offsets_link_pre = offsets_link + offsets_link += map_size[i][0] * map_size[i][1] * ( + N_LOCAL_LINKS + 16 + N_CROSS_LINKS) + link8_mask[offsets_link_pre:offsets_start] = 1 + link8_mask[offsets_end:offsets_link] = 1 + return link_scores_raw[np.where(link8_mask > 0)[0], :] + + +def decode_image_by_mutex(node_scores, link_scores, node_threshold, + link_threshold, map_size, offsets_defaults): + node_mask = node_scores[:, POS_LABEL] >= node_threshold + link_pos = link_scores[:, POS_LABEL] + link_mut = link_scores[:, MUT_LABEL] + link_max = np.max(np.vstack((link_pos, link_mut)), axis=0) + + offsets_pos_list = np.where(node_mask == 1)[0].tolist() + + link_mask_th = link_max >= link_threshold + link_mask = get_link_mask(node_mask, offsets_defaults, link_max) + offsets_link_max = np.argsort(-(link_max * link_mask * link_mask_th)) + offsets_link_max = offsets_link_max[:len(offsets_pos_list) * 8] + + group_mask = np.zeros_like(node_mask, dtype=np.int32) - 1 + mutex_mask = len(node_mask) * [[]] + + def find_parent(point): + return group_mask[point] + + def set_parent(point, parent): + group_mask[point] = parent + + def set_mutex_constraint(point, mutex_point_list): + mutex_mask[point] = mutex_point_list + + def find_mutex_constraint(point): + mutex_point_list = mutex_mask[point] + # update mutex_point_list + mutex_point_list_new = [] + if not mutex_point_list == []: + for mutex_point in 
mutex_point_list: + if not is_root(mutex_point): + mutex_point = find_root(mutex_point) + if mutex_point not in mutex_point_list_new: + mutex_point_list_new.append(mutex_point) + set_mutex_constraint(point, mutex_point_list_new) + return mutex_point_list_new + + def combine_mutex_constraint(point, parent): + mutex_point_list = find_mutex_constraint(point) + mutex_parent_list = find_mutex_constraint(parent) + for mutex_point in mutex_point_list: + if not is_root(mutex_point): + mutex_point = find_root(mutex_point) + if mutex_point not in mutex_parent_list: + mutex_parent_list.append(mutex_point) + set_mutex_constraint(parent, mutex_parent_list) + + def add_mutex_constraint(p1, p2): + mutex_point_list1 = find_mutex_constraint(p1) + mutex_point_list2 = find_mutex_constraint(p2) + + if p1 not in mutex_point_list2: + mutex_point_list2.append(p1) + if p2 not in mutex_point_list1: + mutex_point_list1.append(p2) + set_mutex_constraint(p1, mutex_point_list1) + set_mutex_constraint(p2, mutex_point_list2) + + def is_root(point): + return find_parent(point) == -1 + + def find_root(point): + root = point + update_parent = False + while not is_root(root): + root = find_parent(root) + update_parent = True + + # for acceleration of find_root + if update_parent: + set_parent(point, root) + + return root + + def join(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2 and (root1 not in find_mutex_constraint(root2)): + set_parent(root1, root2) + combine_mutex_constraint(root1, root2) + + def disjoin(p1, p2): + root1 = find_root(p1) + root2 = find_root(p2) + + if root1 != root2: + add_mutex_constraint(root1, root2) + + def get_all(): + root_map = {} + + def get_index(root): + if root not in root_map: + root_map[root] = len(root_map) + 1 + return root_map[root] + + mask = np.zeros_like(node_mask, dtype=np.int32) + for _, point in enumerate(offsets_pos_list): + point_root = find_root(point) + bbox_idx = get_index(point_root) + mask[point] = bbox_idx + return mask + + # join by link + pos_link = 0 + mut_link = 0 + for _, offsets_link in enumerate(offsets_link_max): + l_idx, x, y, link_idx = get_coord_link(offsets_link, map_size, + offsets_defaults) + offsets = offsets_defaults[l_idx][0] + map_size[l_idx][1] * y + x + if offsets in offsets_pos_list: + neighbours = get_neighbours(l_idx, x, y, map_size, + offsets_defaults) + if not len(np.where(np.array(neighbours)[:, + 2] == link_idx)[0]) == 0: + noffsets = neighbours[np.where( + np.array(neighbours)[:, 2] == link_idx)[0][0]] + link_pos_value = link_pos[noffsets[1]] + link_mut_value = link_mut[noffsets[1]] + node_cls = node_mask[noffsets[0]] + if node_cls and (link_pos_value > link_mut_value): + pos_link += 1 + join(offsets, noffsets[0]) + elif node_cls and (link_pos_value < link_mut_value): + mut_link += 1 + disjoin(offsets, noffsets[0]) + + mask = get_all() + return mask diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py new file mode 100644 index 00000000..6371d4e5 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py @@ -0,0 +1,432 @@ +"""Contains definitions for the original form of Residual Networks. +The 'v1' residual networks (ResNets) implemented in this module were proposed +by: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +Other variants were introduced in: +[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. 
arXiv: 1603.05027 +The networks defined in this module utilize the bottleneck building block of +[1] with projection shortcuts only for increasing depths. They employ batch +normalization *after* every weight layer. This is the architecture used by +MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and +ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' +architecture and the alternative 'v2' architecture of [2] which uses batch +normalization *before* every weight layer in the so-called full pre-activation +units. +Typical use: + from tensorflow.contrib.slim.nets import resnet_v1 +ResNet-101 for image classification into 1000 classes: + # inputs has shape [batch, 224, 224, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) +ResNet-101 for semantic segmentation into 21 classes: + # inputs has shape [batch, 513, 513, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, + 21, + is_training=False, + global_pool=False, + output_stride=16) +""" +import tensorflow as tf +import tf_slim as slim + +from . import resnet_utils + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + +resnet_arg_scope = resnet_utils.resnet_arg_scope + + +@slim.add_arg_scope +def basicblock(inputs, + depth, + depth_bottleneck, + stride, + rate=1, + outputs_collections=None, + scope=None): + """Bottleneck residual unit variant with BN after convolutions. + This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for + its definition. Note that we use here the bottleneck variant which has an + extra bottleneck layer. + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. + Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + Returns: + The ResNet unit's output. + """ + with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d( + inputs, + depth, [1, 1], + stride=stride, + activation_fn=None, + scope='shortcut') + + residual = resnet_utils.conv2d_same( + inputs, depth, 3, stride, rate=rate, scope='conv1') + residual = resnet_utils.conv2d_same( + residual, depth, 3, 1, rate=rate, scope='conv2') + + output = tf.nn.relu(residual + shortcut) + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, output) + + +@slim.add_arg_scope +def bottleneck(inputs, + depth, + depth_bottleneck, + stride, + rate=1, + outputs_collections=None, + scope=None): + """Bottleneck residual unit variant with BN after convolutions. + This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for + its definition. Note that we use here the bottleneck variant which has an + extra bottleneck layer. + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. 
+ Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + Returns: + The ResNet unit's output. + """ + with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d( + inputs, + depth, [1, 1], + stride=stride, + activation_fn=None, + scope='shortcut') + + residual = slim.conv2d( + inputs, depth_bottleneck, [1, 1], stride=1, scope='conv1') + residual = resnet_utils.conv2d_same( + residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2') + residual = slim.conv2d( + residual, + depth, [1, 1], + stride=1, + activation_fn=None, + scope='conv3') + + output = tf.nn.relu(shortcut + residual) + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, output) + + +def resnet_v1(inputs, + blocks, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + spatial_squeeze=True, + reuse=None, + scope=None): + """Generator for v1 ResNet models. + This function generates a family of ResNet v1 models. See the resnet_v1_*() + methods for specific model instantiations, obtained by selecting different + block instantiations that produce ResNets of various depths. + Training for image classification on Imagenet is usually done with [224, 224] + inputs, resulting in [7, 7] feature maps at the output of the last ResNet + block for the ResNets defined in [1] that have nominal stride equal to 32. + However, for dense prediction tasks we advise that one uses inputs with + spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In + this case the feature maps at the ResNet output will have spatial shape + [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] + and corners exactly aligned with the input image corners, which greatly + facilitates alignment of the features to the image. Using as input [225, 225] + images results in [8, 8] feature maps at the output of the last ResNet block. + For dense prediction tasks, the ResNet needs to run in fully-convolutional + (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all + have nominal stride equal to 32 and a good choice in FCN mode is to use + output_stride=16 in order to increase the density of the computed features at + small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + blocks: A list of length equal to the number of ResNet blocks. Each element + is a resnet_utils.Block object describing the units in the block. + num_classes: Number of predicted classes for classification tasks. If None + we return the features before the logit layer. + is_training: whether is training or not. + global_pool: If True, we perform global average pooling before computing the + logits. Set to True for image classification, False for dense prediction. + output_stride: If None, then the output will be computed at the nominal + network stride. 
If output_stride is not None, it specifies the requested + ratio of input to output spatial resolution. + include_root_block: If True, include the initial convolution followed by + max-pooling, if False excludes it. + spatial_squeeze: if True, logits is of shape [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + Returns: + net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. + If global_pool is False, then height_out and width_out are reduced by a + factor of output_stride compared to the respective height_in and width_in, + else both height_out and width_out equal one. If num_classes is None, then + net is the output of the last ResNet block, potentially after global + average pooling. If num_classes is not None, net contains the pre-softmax + activations. + end_points: A dictionary from components of the network to the corresponding + activation. + Raises: + ValueError: If the target output_stride is not valid. + """ + with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: + end_points_collection = sc.name + '_end_points' + with slim.arg_scope( + [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense], + outputs_collections=end_points_collection): + with slim.arg_scope([slim.batch_norm], is_training=is_training): + net = inputs + if include_root_block: + if output_stride is not None: + if output_stride % 4 != 0: + raise ValueError( + 'The output_stride needs to be a multiple of 4.' + ) + output_stride /= 4 + net = resnet_utils.conv2d_same( + net, 64, 7, stride=2, scope='conv1') + net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) + net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') + + net = slim.utils.collect_named_outputs( + end_points_collection, 'pool2', net) + + net = resnet_utils.stack_blocks_dense(net, blocks, + output_stride) + + end_points = slim.utils.convert_collection_to_dict( + end_points_collection) + + end_points['pool1'] = end_points['resnet_v1_18/block2/unit_2'] + end_points['pool2'] = end_points['resnet_v1_18/block3/unit_2'] + end_points['pool3'] = end_points['resnet_v1_18/block4/unit_2'] + end_points['pool4'] = end_points['resnet_v1_18/block5/unit_2'] + end_points['pool5'] = end_points['resnet_v1_18/block6/unit_2'] + end_points['pool6'] = net + + return net, end_points + + +resnet_v1.default_image_size = 224 + + +def resnet_v1_18(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_18'): + """ResNet-18 model of [1]. 
See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', basicblock, + [(64, 64, 1)] + [(64, 64, 1)]), + resnet_utils.Block('block2', basicblock, + [(128, 128, 1)] + [(128, 128, 1)]), + resnet_utils.Block('block3', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block4', basicblock, + [(512, 512, 2)] + [(512, 512, 1)]), + resnet_utils.Block('block5', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block6', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + resnet_utils.Block('block7', basicblock, + [(256, 256, 2)] + [(256, 256, 1)]), + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_18.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_50(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_50'): + """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 5 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, + [(2048, 512, 1)] * 3 + [(2048, 512, 2)]), + resnet_utils.Block('block5', bottleneck, + [(1024, 256, 1)] * 2 + [(1024, 256, 2)]), + resnet_utils.Block('block6', bottleneck, [(1024, 256, 1)] * 2), + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_50.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_101(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_101'): + """ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 22 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_101.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_152(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_152'): + """ResNet-152 model of [1]. 
See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 7 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_152.default_image_size = resnet_v1.default_image_size + + +def resnet_v1_200(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + spatial_squeeze=True, + reuse=None, + scope='resnet_v1_200'): + """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block('block1', bottleneck, + [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block('block2', bottleneck, + [(512, 128, 1)] * 23 + [(512, 128, 2)]), + resnet_utils.Block('block3', bottleneck, + [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block('block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1( + inputs, + blocks, + num_classes, + is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=True, + spatial_squeeze=spatial_squeeze, + reuse=reuse, + scope=scope) + + +resnet_v1_200.default_image_size = resnet_v1.default_image_size + +if __name__ == '__main__': + input = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input') + with slim.arg_scope(resnet_arg_scope()) as sc: + logits = resnet_v1_50(input) diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py new file mode 100644 index 00000000..e0e240c8 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py @@ -0,0 +1,231 @@ +"""Contains building blocks for various versions of Residual Networks. +Residual networks (ResNets) were proposed in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 +More variants were introduced in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 +We can obtain different ResNet variants by changing the network depth, width, +and form of residual unit. This module implements the infrastructure for +building them. Concrete ResNet units and full ResNet networks are implemented in +the accompanying resnet_v1.py and resnet_v2.py modules. +Compared to https://github.com/KaimingHe/deep-residual-networks, in the current +implementation we subsample the output activations in the last residual unit of +each block, instead of subsampling the input activations in the first residual +unit of each block. The two implementations give identical results but our +implementation is more memory efficient. +""" + +import collections + +import tensorflow as tf +import tf_slim as slim + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): + """A named tuple describing a ResNet block. + Its parts are: + scope: The scope of the `Block`. + unit_fn: The ResNet unit function which takes as input a `Tensor` and + returns another `Tensor` with the output of the ResNet unit. 
+ args: A list of length equal to the number of units in the `Block`. The list + contains one (depth, depth_bottleneck, stride) tuple for each unit in the + block to serve as argument to unit_fn. + """ + + +def subsample(inputs, factor, scope=None): + """Subsamples the input along the spatial dimensions. + Args: + inputs: A `Tensor` of size [batch, height_in, width_in, channels]. + factor: The subsampling factor. + scope: Optional variable_scope. + Returns: + output: A `Tensor` of size [batch, height_out, width_out, channels] with the + input, either intact (if factor == 1) or subsampled (if factor > 1). + """ + if factor == 1: + return inputs + else: + return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) + + +def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): + """Strided 2-D convolution with 'SAME' padding. + When stride > 1, then we do explicit zero-padding, followed by conv2d with + 'VALID' padding. + Note that + net = conv2d_same(inputs, num_outputs, 3, stride=stride) + is equivalent to + net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') + net = subsample(net, factor=stride) + whereas + net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') + is different when the input's height or width is even, which is why we add the + current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). + Args: + inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. + num_outputs: An integer, the number of output filters. + kernel_size: An int with the kernel_size of the filters. + stride: An integer, the output stride. + rate: An integer, rate for atrous convolution. + scope: Scope. + Returns: + output: A 4-D tensor of size [batch, height_out, width_out, channels] with + the convolution output. + """ + if stride == 1: + return slim.conv2d( + inputs, + num_outputs, + kernel_size, + stride=1, + rate=rate, + padding='SAME', + scope=scope) + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + inputs = tf.pad( + inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + return slim.conv2d( + inputs, + num_outputs, + kernel_size, + stride=stride, + rate=rate, + padding='VALID', + scope=scope) + + +@slim.add_arg_scope +def stack_blocks_dense(net, + blocks, + output_stride=None, + outputs_collections=None): + """Stacks ResNet `Blocks` and controls output feature density. + First, this function creates scopes for the ResNet in the form of + 'block_name/unit_1', 'block_name/unit_2', etc. + Second, this function allows the user to explicitly control the ResNet + output_stride, which is the ratio of the input to output spatial resolution. + This is useful for dense prediction tasks such as semantic segmentation or + object detection. + Most ResNets consist of 4 ResNet blocks and subsample the activations by a + factor of 2 when transitioning between consecutive ResNet blocks. This results + to a nominal ResNet output_stride equal to 8. If we set the output_stride to + half the nominal network stride (e.g., output_stride=4), then we compute + responses twice. + Control of the output feature density is implemented by atrous convolution. + Args: + net: A `Tensor` of size [batch, height, width, channels]. + blocks: A list of length equal to the number of ResNet `Blocks`. Each + element is a ResNet `Block` object describing the units in the `Block`. 
+ output_stride: If `None`, then the output will be computed at the nominal + network stride. If output_stride is not `None`, it specifies the requested + ratio of input to output spatial resolution, which needs to be equal to + the product of unit strides from the start up to some level of the ResNet. + For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, + then valid values for the output_stride are 1, 2, 6, 24 or None (which + is equivalent to output_stride=24). + outputs_collections: Collection to add the ResNet block outputs. + Returns: + net: Output tensor with stride equal to the specified output_stride. + Raises: + ValueError: If the target output_stride is not valid. + """ + # The current_stride variable keeps track of the effective stride of the + # activations. This allows us to invoke atrous convolution whenever applying + # the next residual unit would result in the activations having stride larger + # than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + for block in blocks: + with tf.variable_scope(block.scope, 'block', [net]): + for i, unit in enumerate(block.args): + if output_stride is not None and current_stride > output_stride: + raise ValueError( + 'The target output_stride cannot be reached.') + + with tf.variable_scope( + 'unit_%d' % (i + 1), values=[net]) as sc: + unit_depth, unit_depth_bottleneck, unit_stride = unit + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + if output_stride is not None and current_stride == output_stride: + net = block.unit_fn( + net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=1, + rate=rate) + rate *= unit_stride + + else: + net = block.unit_fn( + net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=unit_stride, + rate=1) + current_stride *= unit_stride + net = slim.utils.collect_named_outputs( + outputs_collections, sc.name, net) + + if output_stride is not None and current_stride != output_stride: + raise ValueError('The target output_stride cannot be reached.') + + return net + + +def resnet_arg_scope(weight_decay=0.0001, + batch_norm_decay=0.997, + batch_norm_epsilon=1e-5, + batch_norm_scale=True): + """Defines the default ResNet arg scope. + TODO(gpapan): The batch-normalization related default values above are + appropriate for use in conjunction with the reference ResNet models + released at https://github.com/KaimingHe/deep-residual-networks. When + training ResNets from scratch, they might need to be tuned. + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: The moving average decay when estimating layer activation + statistics in batch normalization. + batch_norm_epsilon: Small constant to prevent division by zero when + normalizing activations by their variance in batch normalization. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + Returns: + An `arg_scope` to use for the resnet models. 
+ """ + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'scale': batch_norm_scale, + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + with slim.arg_scope( + [slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + # The following implies padding='SAME' for pool1, which makes feature + # alignment easier for dense prediction tasks. This is also used in + # https://github.com/facebook/fb.resnet.torch. However the accompanying + # code of 'Deep Residual Learning for Image Recognition' uses + # padding='VALID' for pool1. You can switch to that choice by setting + # slim.arg_scope([slim.max_pool2d], padding='VALID'). + with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: + return arg_sc diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py new file mode 100644 index 00000000..be8e3371 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -0,0 +1,108 @@ +import cv2 +import numpy as np + + +def rboxes_to_polygons(rboxes): + """ + Convert rboxes to polygons + ARGS + `rboxes`: [n, 5] + RETURN + `polygons`: [n, 8] + """ + + theta = rboxes[:, 4:5] + cxcy = rboxes[:, :2] + half_w = rboxes[:, 2:3] / 2. + half_h = rboxes[:, 3:4] / 2. + v1 = np.hstack([np.cos(theta) * half_w, np.sin(theta) * half_w]) + v2 = np.hstack([-np.sin(theta) * half_h, np.cos(theta) * half_h]) + p1 = cxcy - v1 - v2 + p2 = cxcy + v1 - v2 + p3 = cxcy + v1 + v2 + p4 = cxcy - v1 + v2 + polygons = np.hstack([p1, p2, p3, p4]) + return polygons + + +def cal_width(box): + pd1 = point_dist(box[0], box[1], box[2], box[3]) + pd2 = point_dist(box[4], box[5], box[6], box[7]) + return (pd1 + pd2) / 2 + + +def point_dist(x1, y1, x2, y2): + return np.sqrt((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1)) + + +def draw_polygons(img, polygons): + for p in polygons.tolist(): + p = [int(o) for o in p] + cv2.line(img, (p[0], p[1]), (p[2], p[3]), (0, 255, 0), 1) + cv2.line(img, (p[2], p[3]), (p[4], p[5]), (0, 255, 0), 1) + cv2.line(img, (p[4], p[5]), (p[6], p[7]), (0, 255, 0), 1) + cv2.line(img, (p[6], p[7]), (p[0], p[1]), (0, 255, 0), 1) + return img + + +def nms_python(boxes): + boxes = sorted(boxes, key=lambda x: -x[8]) + nms_flag = [True] * len(boxes) + for i, a in enumerate(boxes): + if not nms_flag[i]: + continue + else: + for j, b in enumerate(boxes): + if not j > i: + continue + if not nms_flag[j]: + continue + score_a = a[8] + score_b = b[8] + rbox_a = polygon2rbox(a[:8]) + rbox_b = polygon2rbox(b[:8]) + if point_in_rbox(rbox_a[:2], rbox_b) or point_in_rbox( + rbox_b[:2], rbox_a): + if score_a > score_b: + nms_flag[j] = False + boxes_nms = [] + for i, box in enumerate(boxes): + if nms_flag[i]: + boxes_nms.append(box) + return boxes_nms + + +def point_in_rbox(c, rbox): + cx0, cy0 = c[0], c[1] + cx1, cy1 = rbox[0], rbox[1] + w, h = rbox[2], rbox[3] + theta = rbox[4] + dist_x = np.abs((cx1 - cx0) * np.cos(theta) + (cy1 - cy0) * np.sin(theta)) + dist_y = np.abs(-(cx1 - cx0) * np.sin(theta) + (cy1 - cy0) * np.cos(theta)) + return ((dist_x < w / 2.0) and (dist_y < h / 2.0)) + + +def polygon2rbox(polygon): + x1, x2, x3, x4 = polygon[0], polygon[2], polygon[4], polygon[6] + y1, y2, y3, y4 = polygon[1], polygon[3], polygon[5], polygon[7] + c_x = (x1 + x2 + x3 + x4) / 4 + c_y = (y1 + y2 + y3 + y4) / 4 + 
w1 = point_dist(x1, y1, x2, y2) + w2 = point_dist(x3, y3, x4, y4) + h1 = point_line_dist(c_x, c_y, x1, y1, x2, y2) + h2 = point_line_dist(c_x, c_y, x3, y3, x4, y4) + h = h1 + h2 + w = (w1 + w2) / 2 + theta1 = np.arctan2(y2 - y1, x2 - x1) + theta2 = np.arctan2(y3 - y4, x3 - x4) + theta = (theta1 + theta2) / 2.0 + return [c_x, c_y, w, h, theta] + + +def point_line_dist(px, py, x1, y1, x2, y2): + eps = 1e-6 + dx = x2 - x1 + dy = y2 - y1 + div = np.sqrt(dx * dx + dy * dy) + eps + dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div + return dist diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index b1ee121c..b7402b93 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -1 +1 @@ -from .image_caption_pipeline import ImageCaptionPipeline +from .image_captioning_pipeline import ImageCaptionPipeline diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py new file mode 100644 index 00000000..9f32caf4 --- /dev/null +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -0,0 +1,35 @@ +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.preprocessors import OfaImageCaptionPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from ..base import Model, Pipeline +from ..builder import PIPELINES + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_captioning, module_name=Pipelines.image_caption) +class ImageCaptionPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: [Preprocessor] = None, + **kwargs): + super().__init__() + assert isinstance(model, str) or isinstance(model, Model), \ + 'model must be a single str or OfaForImageCaptioning' + if isinstance(model, str): + pipe_model = Model.from_pretrained(model) + elif isinstance(model, Model): + pipe_model = model + else: + raise NotImplementedError + if preprocessor is None and pipe_model: + preprocessor = OfaImageCaptionPreprocessor(model_dir=model) + super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index adfa1d4c..df8dbbd9 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -1,7 +1,10 @@ +from .dialog_intent_prediction_pipeline import * # noqa F403 +from .dialog_modeling_pipeline import * # noqa F403 +from .dialog_state_tracking import * # noqa F403 +from .fill_mask_pipeline import * # noqa F403 +from .nli_pipeline import * # noqa F403 from .sentence_similarity_pipeline import * # noqa F403 +from .sentiment_classification_pipeline import * # noqa F403 from .sequence_classification_pipeline import * # noqa F403 -from .space.dialog_intent_prediction_pipeline import * # noqa F403 -from .space.dialog_modeling_pipeline import * # noqa F403 -from .space.dialog_state_tracking import * # noqa F403 from .text_generation_pipeline import * # noqa F403 from .word_segmentation_pipeline import * # noqa F403 diff --git a/modelscope/pipelines/nlp/space/dialog_intent_prediction_pipeline.py b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py similarity index 75% rename from modelscope/pipelines/nlp/space/dialog_intent_prediction_pipeline.py 
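The rotated-box helpers added in ocr_utils/utils.py above are inverses of each other up to floating-point error. A minimal sketch, not part of the patch, assuming the module is importable at the path introduced by this diff:

import numpy as np
from modelscope.pipelines.cv.ocr_utils.utils import polygon2rbox, rboxes_to_polygons

# One rotated box as [cx, cy, w, h, theta]; convert to the 8-value polygon and back.
rbox = np.array([[50.0, 30.0, 40.0, 20.0, np.pi / 6]])
polygon = rboxes_to_polygons(rbox)[0]        # [x1, y1, x2, y2, x3, y3, x4, y4]
recovered = polygon2rbox(polygon.tolist())   # approximately [50, 30, 40, 20, pi/6]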
rename to modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py index 57245bdf..3fd38641 100644 --- a/modelscope/pipelines/nlp/space/dialog_intent_prediction_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py @@ -1,16 +1,18 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict -from modelscope.models.nlp import DialogIntentModel -from modelscope.preprocessors import DialogIntentPredictionPreprocessor -from modelscope.utils.constant import Tasks -from ...base import Input, Pipeline -from ...builder import PIPELINES +from ...metainfo import Pipelines +from ...models.nlp import DialogIntentModel +from ...preprocessors import DialogIntentPredictionPreprocessor +from ...utils.constant import Tasks +from ..base import Pipeline +from ..builder import PIPELINES __all__ = ['DialogIntentPredictionPipeline'] @PIPELINES.register_module( - Tasks.dialog_intent_prediction, module_name=r'space-intent') + Tasks.dialog_intent_prediction, + module_name=Pipelines.dialog_intent_prediction) class DialogIntentPredictionPipeline(Pipeline): def __init__(self, model: DialogIntentModel, diff --git a/modelscope/pipelines/nlp/space/dialog_modeling_pipeline.py b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py similarity index 89% rename from modelscope/pipelines/nlp/space/dialog_modeling_pipeline.py rename to modelscope/pipelines/nlp/dialog_modeling_pipeline.py index afa352b6..778284de 100644 --- a/modelscope/pipelines/nlp/space/dialog_modeling_pipeline.py +++ b/modelscope/pipelines/nlp/dialog_modeling_pipeline.py @@ -3,14 +3,15 @@ from typing import Any, Dict, Optional from modelscope.models.nlp import DialogModelingModel from modelscope.preprocessors import DialogModelingPreprocessor from modelscope.utils.constant import Tasks -from ...base import Pipeline, Tensor -from ...builder import PIPELINES +from ...metainfo import Pipelines +from ..base import Pipeline, Tensor +from ..builder import PIPELINES __all__ = ['DialogModelingPipeline'] @PIPELINES.register_module( - Tasks.dialog_modeling, module_name=r'space-modeling') + Tasks.dialog_modeling, module_name=Pipelines.dialog_modeling) class DialogModelingPipeline(Pipeline): def __init__(self, model: DialogModelingModel, diff --git a/modelscope/pipelines/nlp/dialog_state_tracking.py b/modelscope/pipelines/nlp/dialog_state_tracking.py new file mode 100644 index 00000000..823248d2 --- /dev/null +++ b/modelscope/pipelines/nlp/dialog_state_tracking.py @@ -0,0 +1,45 @@ +from typing import Any, Dict + +from ...metainfo import Pipelines +from ...models.nlp import DialogStateTrackingModel +from ...preprocessors import DialogStateTrackingPreprocessor +from ...utils.constant import Tasks +from ..base import Pipeline +from ..builder import PIPELINES + +__all__ = ['DialogStateTrackingPipeline'] + + +@PIPELINES.register_module( + Tasks.dialog_state_tracking, module_name=Pipelines.dialog_state_tracking) +class DialogStateTrackingPipeline(Pipeline): + + def __init__(self, model: DialogStateTrackingModel, + preprocessor: DialogStateTrackingPreprocessor, **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (SequenceClassificationModel): a model instance + preprocessor (SequenceClassificationPreprocessor): a preprocessor instance + """ + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model = model + # self.tokenizer = preprocessor.tokenizer + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + """process the 
prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + import numpy as np + pred = inputs['pred'] + pos = np.where(pred == np.max(pred)) + + result = {'pred': pred, 'label': pos[0]} + + return result diff --git a/modelscope/pipelines/nlp/fill_mask_pipeline.py b/modelscope/pipelines/nlp/fill_mask_pipeline.py new file mode 100644 index 00000000..596d65f7 --- /dev/null +++ b/modelscope/pipelines/nlp/fill_mask_pipeline.py @@ -0,0 +1,107 @@ +from typing import Any, Dict, Optional, Union + +import torch + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp.masked_language_model import MaskedLanguageModelBase +from ...preprocessors import FillMaskPreprocessor +from ...utils.constant import Tasks +from ..base import Pipeline, Tensor +from ..builder import PIPELINES + +__all__ = ['FillMaskPipeline'] + + +@PIPELINES.register_module(Tasks.fill_mask, module_name=Pipelines.fill_mask) +class FillMaskPipeline(Pipeline): + + def __init__(self, + model: Union[MaskedLanguageModelBase, str], + preprocessor: Optional[FillMaskPreprocessor] = None, + first_sequence='sentense', + **kwargs): + """use `model` and `preprocessor` to create a nlp fill mask pipeline for prediction + + Args: + model (MaskedLanguageModelBase): a model instance + preprocessor (FillMaskPreprocessor): a preprocessor instance + """ + fill_mask_model = model if isinstance( + model, MaskedLanguageModelBase) else Model.from_pretrained(model) + assert fill_mask_model.config is not None + + if preprocessor is None: + preprocessor = FillMaskPreprocessor( + fill_mask_model.model_dir, + first_sequence=first_sequence, + second_sequence=None) + fill_mask_model.eval() + super().__init__( + model=fill_mask_model, preprocessor=preprocessor, **kwargs) + + self.preprocessor = preprocessor + self.tokenizer = preprocessor.tokenizer + self.mask_id = {'veco': 250001, 'sbert': 103} + + self.rep_map = { + 'sbert': { + '[unused0]': '', + '[PAD]': '', + '[unused1]': '', + r' +': ' ', + '[SEP]': '', + '[unused2]': '', + '[CLS]': '', + '[UNK]': '' + }, + 'veco': { + r' +': ' ', + '': '', + '': '', + '': '', + '': '', + '': ' ' + } + } + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + import numpy as np + logits = inputs['logits'].detach().numpy() + input_ids = inputs['input_ids'].detach().numpy() + pred_ids = np.argmax(logits, axis=-1) + model_type = self.model.config.model_type + rst_ids = np.where(input_ids == self.mask_id[model_type], pred_ids, + input_ids) + + def rep_tokens(string, rep_map): + for k, v in rep_map.items(): + string = string.replace(k, v) + return string.strip() + + pred_strings = [] + for ids in rst_ids: # batch + # TODO vocab size is not stable + + if self.model.config.vocab_size == 21128: # zh bert + pred_string = self.tokenizer.convert_ids_to_tokens(ids) + pred_string = ''.join(pred_string) + else: + pred_string = self.tokenizer.decode(ids) + pred_string = rep_tokens(pred_string, self.rep_map[model_type]) + pred_strings.append(pred_string) + + return {'text': pred_strings} diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py new file mode 100644 index 
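The masked-token substitution in FillMaskPipeline.postprocess above reduces to a single vectorised np.where. A minimal sketch, not part of the patch, with hypothetical token ids; the mask id 103 and vocab size 21128 are the sbert values used by the pipeline:

import numpy as np

mask_id = 103                                   # sbert [MASK] id from the pipeline's mask_id map
input_ids = np.array([[101, 2769, 103, 102]])   # hypothetical encoded sentence with one mask
logits = np.random.randn(1, 4, 21128)           # [batch, sequence, vocab]
pred_ids = np.argmax(logits, axis=-1)
# Masked positions take the model's argmax prediction; all others keep the input id.
rst_ids = np.where(input_ids == mask_id, pred_ids, input_ids)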
00000000..49dc330f --- /dev/null +++ b/modelscope/pipelines/nlp/nli_pipeline.py @@ -0,0 +1,72 @@ +import uuid +from typing import Any, Dict, Union + +import numpy as np +import torch + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp import SbertForNLI +from ...preprocessors import NLIPreprocessor +from ...utils.constant import Tasks +from ..base import Pipeline +from ..builder import PIPELINES + +__all__ = ['NLIPipeline'] + + +@PIPELINES.register_module(Tasks.nli, module_name=Pipelines.nli) +class NLIPipeline(Pipeline): + + def __init__(self, + model: Union[SbertForNLI, str], + preprocessor: NLIPreprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (SbertForNLI): a model instance + preprocessor (NLIPreprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, SbertForNLI), \ + 'model must be a single str or SbertForNLI' + model = model if isinstance( + model, SbertForNLI) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = NLIPreprocessor( + model.model_dir, + first_sequence=first_sequence, + second_sequence=second_sequence) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + assert len(model.id2label) > 0 + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs['probabilities'][0] + num_classes = probs.shape[0] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + cls_ids = top_indices[np.argsort(probs[top_indices])] + probs = probs[cls_ids].tolist() + + cls_names = [self.model.id2label[cid] for cid in cls_ids] + + return {'scores': probs, 'labels': cls_names} diff --git a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py index 1b630c10..f6bcd72e 100644 --- a/modelscope/pipelines/nlp/sentence_similarity_pipeline.py +++ b/modelscope/pipelines/nlp/sentence_similarity_pipeline.py @@ -1,11 +1,13 @@ from typing import Any, Dict, Union import numpy as np +import torch -from modelscope.models.nlp import SbertForSentenceSimilarity -from modelscope.preprocessors import SequenceClassificationPreprocessor -from modelscope.utils.constant import Tasks +from ...metainfo import Pipelines from ...models import Model +from ...models.nlp import SbertForSentenceSimilarity +from ...preprocessors import SequenceClassificationPreprocessor +from ...utils.constant import Tasks from ..base import Input, Pipeline from ..builder import PIPELINES @@ -13,13 +15,14 @@ __all__ = ['SentenceSimilarityPipeline'] @PIPELINES.register_module( - Tasks.sentence_similarity, - module_name=r'sbert-base-chinese-sentence-similarity') + Tasks.sentence_similarity, module_name=Pipelines.sentence_similarity) class SentenceSimilarityPipeline(Pipeline): def __init__(self, - model: Union[SbertForSentenceSimilarity, str], + model: Union[Model, str], preprocessor: SequenceClassificationPreprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', **kwargs): """use 
`model` and `preprocessor` to create a nlp sentence similarity pipeline for prediction @@ -35,14 +38,21 @@ class SentenceSimilarityPipeline(Pipeline): if preprocessor is None: preprocessor = SequenceClassificationPreprocessor( sc_model.model_dir, - first_sequence='first_sequence', - second_sequence='second_sequence') + first_sequence=first_sequence, + second_sequence=second_sequence) + sc_model.eval() super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) assert hasattr(self.model, 'id2label'), \ 'id2label map should be initalizaed in init function.' - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: """process the prediction results Args: diff --git a/modelscope/pipelines/nlp/sentiment_classification_pipeline.py b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py new file mode 100644 index 00000000..9291ed44 --- /dev/null +++ b/modelscope/pipelines/nlp/sentiment_classification_pipeline.py @@ -0,0 +1,77 @@ +import os +import uuid +from typing import Any, Dict, Union + +import json +import numpy as np +import torch + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp import SbertForSentimentClassification +from ...preprocessors import SentimentClassificationPreprocessor +from ...utils.constant import Tasks +from ..base import Input, Pipeline +from ..builder import PIPELINES + +__all__ = ['SentimentClassificationPipeline'] + + +@PIPELINES.register_module( + Tasks.sentiment_classification, + module_name=Pipelines.sentiment_classification) +class SentimentClassificationPipeline(Pipeline): + + def __init__(self, + model: Union[SbertForSentimentClassification, str], + preprocessor: SentimentClassificationPreprocessor = None, + first_sequence='first_sequence', + second_sequence='second_sequence', + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (SbertForSentimentClassification): a model instance + preprocessor (SentimentClassificationPreprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, SbertForSentimentClassification), \ + 'model must be a single str or SbertForSentimentClassification' + model = model if isinstance( + model, + SbertForSentimentClassification) else Model.from_pretrained(model) + if preprocessor is None: + preprocessor = SentimentClassificationPreprocessor( + model.model_dir, + first_sequence=first_sequence, + second_sequence=second_sequence) + model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + assert len(model.id2label) > 0 + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = 5) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs['probabilities'][0] + num_classes = probs.shape[0] + topk = min(topk, num_classes) + top_indices = np.argpartition(probs, -topk)[-topk:] + cls_ids = top_indices[np.argsort(probs[top_indices])] + probs = probs[cls_ids].tolist() + + cls_names = [self.model.id2label[cid] for cid in 
cls_ids] + + return {'scores': probs, 'labels': cls_names} diff --git a/modelscope/pipelines/nlp/sequence_classification_pipeline.py b/modelscope/pipelines/nlp/sequence_classification_pipeline.py index 1dbe2efd..43c81d60 100644 --- a/modelscope/pipelines/nlp/sequence_classification_pipeline.py +++ b/modelscope/pipelines/nlp/sequence_classification_pipeline.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Union import numpy as np +from modelscope.metainfo import Pipelines from modelscope.models.nlp import BertForSequenceClassification from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks @@ -13,7 +14,7 @@ __all__ = ['SequenceClassificationPipeline'] @PIPELINES.register_module( - Tasks.text_classification, module_name=r'bert-sentiment-analysis') + Tasks.text_classification, module_name=Pipelines.sentiment_analysis) class SequenceClassificationPipeline(Pipeline): def __init__(self, diff --git a/modelscope/pipelines/nlp/space/dialog_state_tracking.py b/modelscope/pipelines/nlp/space/dialog_state_tracking.py deleted file mode 100644 index 4a943095..00000000 --- a/modelscope/pipelines/nlp/space/dialog_state_tracking.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Any, Dict, Optional - -from modelscope.models.nlp import DialogModelingModel -from modelscope.preprocessors import DialogModelingPreprocessor -from modelscope.utils.constant import Tasks -from ...base import Pipeline, Tensor -from ...builder import PIPELINES - -__all__ = ['DialogStateTrackingPipeline'] - - -@PIPELINES.register_module( - Tasks.dialog_state_tracking, module_name=r'space-dst') -class DialogStateTrackingPipeline(Pipeline): - - def __init__(self, model: DialogModelingModel, - preprocessor: DialogModelingPreprocessor, **kwargs): - """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction - - Args: - model (SequenceClassificationModel): a model instance - preprocessor (SequenceClassificationPreprocessor): a preprocessor instance - """ - - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.model = model - self.preprocessor = preprocessor - - def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: - """process the prediction results - - Args: - inputs (Dict[str, Any]): _description_ - - Returns: - Dict[str, str]: the prediction results - """ - sys_rsp = self.preprocessor.text_field.tokenizer.convert_ids_to_tokens( - inputs['resp']) - assert len(sys_rsp) > 2 - sys_rsp = sys_rsp[1:len(sys_rsp) - 1] - # sys_rsp = self.preprocessor.text_field.tokenizer. 
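The top-k scoring used by both NLIPipeline and SentimentClassificationPipeline above boils down to an argpartition followed by an argsort over only the selected entries. A minimal sketch, not part of the patch, with made-up probabilities; note the result is ordered from lowest to highest score, matching the pipelines:

import numpy as np

probs = np.array([0.05, 0.6, 0.1, 0.2, 0.05])
topk = min(3, probs.shape[0])
top_indices = np.argpartition(probs, -topk)[-topk:]     # unordered indices of the 3 largest
cls_ids = top_indices[np.argsort(probs[top_indices])]   # ascending by probability
print(cls_ids.tolist(), probs[cls_ids].tolist())        # [2, 3, 1] [0.1, 0.2, 0.6]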
- - inputs['sys'] = sys_rsp - - return inputs diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 881e7ea6..8f55cce0 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -1,16 +1,20 @@ -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Union -from modelscope.models import Model -from modelscope.models.nlp import PalmForTextGeneration -from modelscope.preprocessors import TextGenerationPreprocessor -from modelscope.utils.constant import Tasks +import torch + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp import PalmForTextGeneration +from ...preprocessors import TextGenerationPreprocessor +from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES __all__ = ['TextGenerationPipeline'] -@PIPELINES.register_module(Tasks.text_generation, module_name=r'palm2.0') +@PIPELINES.register_module( + Tasks.text_generation, module_name=Pipelines.text_generation) class TextGenerationPipeline(Pipeline): def __init__(self, @@ -31,10 +35,17 @@ class TextGenerationPipeline(Pipeline): model.tokenizer, first_sequence='sentence', second_sequence=None) + model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.tokenizer = model.tokenizer - def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, str]: + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor], + **postprocess_params) -> Dict[str, str]: """process the prediction results Args: diff --git a/modelscope/pipelines/nlp/word_segmentation_pipeline.py b/modelscope/pipelines/nlp/word_segmentation_pipeline.py index 1cc08a38..9501efb7 100644 --- a/modelscope/pipelines/nlp/word_segmentation_pipeline.py +++ b/modelscope/pipelines/nlp/word_segmentation_pipeline.py @@ -1,9 +1,12 @@ from typing import Any, Dict, Optional, Union -from modelscope.models import Model -from modelscope.models.nlp import StructBertForTokenClassification -from modelscope.preprocessors import TokenClassifcationPreprocessor -from modelscope.utils.constant import Tasks +import torch + +from ...metainfo import Pipelines +from ...models import Model +from ...models.nlp import SbertForTokenClassification +from ...preprocessors import TokenClassifcationPreprocessor +from ...utils.constant import Tasks from ..base import Pipeline, Tensor from ..builder import PIPELINES @@ -11,12 +14,11 @@ __all__ = ['WordSegmentationPipeline'] @PIPELINES.register_module( - Tasks.word_segmentation, - module_name=r'structbert-chinese-word-segmentation') + Tasks.word_segmentation, module_name=Pipelines.word_segmentation) class WordSegmentationPipeline(Pipeline): def __init__(self, - model: Union[StructBertForTokenClassification, str], + model: Union[SbertForTokenClassification, str], preprocessor: Optional[TokenClassifcationPreprocessor] = None, **kwargs): """use `model` and `preprocessor` to create a nlp word segmentation pipeline for prediction @@ -27,15 +29,23 @@ class WordSegmentationPipeline(Pipeline): """ model = model if isinstance( model, - StructBertForTokenClassification) else Model.from_pretrained(model) + SbertForTokenClassification) else Model.from_pretrained(model) if preprocessor is None: preprocessor = TokenClassifcationPreprocessor(model.model_dir) + 
model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.tokenizer = preprocessor.tokenizer self.config = model.config + assert len(self.config.id2label) > 0 self.id2label = self.config.id2label - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: """process the prediction results Args: diff --git a/modelscope/pipelines/outputs.py b/modelscope/pipelines/outputs.py index 15d8a995..a950fa69 100644 --- a/modelscope/pipelines/outputs.py +++ b/modelscope/pipelines/outputs.py @@ -45,6 +45,12 @@ TASK_OUTPUTS = { Tasks.image_matting: ['output_png'], Tasks.image_generation: ['output_png'], + # action recognition result for single video + # { + # "output_label": "abseiling" + # } + Tasks.action_recognition: ['output_label'], + # pose estimation result for single sample # { # "poses": np.array with shape [num_pose, num_keypoint, 3], @@ -54,6 +60,13 @@ TASK_OUTPUTS = { # } Tasks.pose_estimation: ['poses', 'boxes'], + # ocr detection result for single sample + # { + # "det_polygons": np.array with shape [num_text, 8], each box is + # [x1, y1, x2, y2, x3, y3, x4, y4] + # } + Tasks.ocr_detection: ['det_polygons'], + # ============ nlp tasks =================== # text classification result for single sample @@ -69,6 +82,12 @@ TASK_OUTPUTS = { # } Tasks.text_generation: ['text'], + # fill mask result for single sample + # { + # "text": "this is the text which masks filled by model." + # } + Tasks.fill_mask: ['text'], + # word segmentation result for single sample # { # "output": "今天 天气 不错 , 适合 出去 游玩" @@ -82,6 +101,20 @@ TASK_OUTPUTS = { # } Tasks.sentence_similarity: ['scores', 'labels'], + # sentiment classification result for single sample + # { + # "labels": ["happy", "sad", "calm", "angry"], + # "scores": [0.9, 0.1, 0.05, 0.05] + # } + Tasks.sentiment_classification: ['scores', 'labels'], + + # nli result for single sample + # { + # "labels": ["happy", "sad", "calm", "angry"], + # "scores": [0.9, 0.1, 0.05, 0.05] + # } + Tasks.nli: ['scores', 'labels'], + # ============ audio tasks =================== # audio processed for single file in PCM format diff --git a/modelscope/pipelines/util.py b/modelscope/pipelines/util.py index 37c9c929..d034a7d4 100644 --- a/modelscope/pipelines/util.py +++ b/modelscope/pipelines/util.py @@ -2,8 +2,8 @@ import os.path as osp from typing import List, Union -from maas_hub.file_download import model_file_download - +from modelscope.hub.api import HubApi +from modelscope.hub.file_download import model_file_download from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile from modelscope.utils.logger import get_logger @@ -20,31 +20,63 @@ def is_config_has_model(cfg_file): return False -def is_model_name(model: Union[str, List]): - """ whether model is a valid modelhub path +def is_official_hub_path(path: Union[str, List]): + """ Whether path is a official hub name or a valid local + path to official hub directory. 
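The TASK_OUTPUTS entries added above document the keys each task's pipeline is expected to return, so they can double as a lightweight contract check. A minimal sketch, not part of the patch; the result dict is hypothetical:

from modelscope.pipelines.outputs import TASK_OUTPUTS
from modelscope.utils.constant import Tasks

result = {'scores': [0.9, 0.1], 'labels': ['entailment', 'contradiction']}  # hypothetical pipeline output
expected_keys = TASK_OUTPUTS[Tasks.nli]             # ['scores', 'labels'] per the table above
assert all(key in result for key in expected_keys)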
+ """ + + def is_official_hub_impl(path): + if osp.exists(path): + cfg_file = osp.join(path, ModelFile.CONFIGURATION) + return osp.exists(cfg_file) + else: + try: + _ = HubApi().get_model(path) + return True + except Exception: + return False + + if isinstance(path, str): + return is_official_hub_impl(path) + else: + results = [is_official_hub_impl(m) for m in path] + all_true = all(results) + any_true = any(results) + if any_true and not all_true: + raise ValueError( + f'some model are hub address, some are not, model list: {path}' + ) + + return all_true + + +def is_model(path: Union[str, List]): + """ whether path is a valid modelhub path and containing model config """ - def is_model_name_impl(model): - if osp.exists(model): - cfg_file = osp.join(model, ModelFile.CONFIGURATION) + def is_modelhub_path_impl(path): + if osp.exists(path): + cfg_file = osp.join(path, ModelFile.CONFIGURATION) if osp.exists(cfg_file): return is_config_has_model(cfg_file) else: return False else: try: - cfg_file = model_file_download(model, ModelFile.CONFIGURATION) + cfg_file = model_file_download(path, ModelFile.CONFIGURATION) return is_config_has_model(cfg_file) except Exception: return False - if isinstance(model, str): - return is_model_name_impl(model) + if isinstance(path, str): + return is_modelhub_path_impl(path) else: - results = [is_model_name_impl(m) for m in model] + results = [is_modelhub_path_impl(m) for m in path] all_true = all(results) any_true = any(results) if any_true and not all_true: - raise ValueError('some model are hub address, some are not') + raise ValueError( + f'some models are hub address, some are not, model list: {path}' + ) return all_true diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 7b67507a..fe68173a 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -1,12 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from .audio import LinearAECAndFbank +# from .audio import LinearAECAndFbank from .base import Preprocessor -from .builder import PREPROCESSORS, build_preprocessor +# from .builder import PREPROCESSORS, build_preprocessor from .common import Compose -from .image import LoadImage, load_image from .nlp import * # noqa F403 from .space.dialog_intent_prediction_preprocessor import * # noqa F403 from .space.dialog_modeling_preprocessor import * # noqa F403 from .space.dialog_state_tracking_preprocessor import * # noqa F403 -from .text_to_speech import * # noqa F403 + +# from .text_to_speech import * # noqa F403 diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index 6bd8aed5..b2123fb7 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -5,11 +5,12 @@ from typing import Dict, Union from PIL import Image, ImageOps from modelscope.fileio import File +from modelscope.metainfo import Preprocessors from modelscope.utils.constant import Fields from .builder import PREPROCESSORS -@PREPROCESSORS.register_module(Fields.cv) +@PREPROCESSORS.register_module(Fields.cv, Preprocessors.load_image) class LoadImage: """Load an image from file or url. 
Added or updated keys are "filename", "img", "img_shape", diff --git a/modelscope/pipelines/multi_modal/image_caption_pipeline.py b/modelscope/preprocessors/multi_modal.py similarity index 55% rename from modelscope/pipelines/multi_modal/image_caption_pipeline.py rename to modelscope/preprocessors/multi_modal.py index 3e5f49d0..7c8f0fab 100644 --- a/modelscope/pipelines/multi_modal/image_caption_pipeline.py +++ b/modelscope/preprocessors/multi_modal.py @@ -1,32 +1,48 @@ -from typing import Any, Dict +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict, Union import numpy as np import torch from PIL import Image -from modelscope.pipelines.base import Input -from modelscope.preprocessors import load_image -from modelscope.utils.constant import Tasks -from modelscope.utils.logger import get_logger -from ..base import Pipeline -from ..builder import PIPELINES +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Preprocessors +from modelscope.utils.constant import Fields, ModelFile +from modelscope.utils.type_assert import type_assert +from .base import Preprocessor +from .builder import PREPROCESSORS +from .image import load_image -logger = get_logger() +__all__ = [ + 'OfaImageCaptionPreprocessor', +] -@PIPELINES.register_module(Tasks.image_captioning, module_name='ofa') -class ImageCaptionPipeline(Pipeline): - # TODO: refine using modelhub - def __init__(self, model: str, bpe_dir: str): - super().__init__() - # turn on cuda if GPU is available +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=Preprocessors.ofa_image_caption) +class OfaImageCaptionPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + if osp.exists(model_dir): + local_model_dir = model_dir + else: + local_model_dir = snapshot_download(model_dir) + local_model = osp.join(local_model_dir, ModelFile.TORCH_MODEL_FILE) + bpe_dir = local_model_dir + from fairseq import checkpoint_utils, tasks, utils from ofa.tasks.mm_tasks import CaptionTask tasks.register_task('caption', CaptionTask) - use_cuda = False - # use fp16 only when GPU is available - use_fp16 = False + overrides = { 'bpe_dir': bpe_dir, 'eval_cider': False, @@ -35,21 +51,9 @@ class ImageCaptionPipeline(Pipeline): 'no_repeat_ngram_size': 3, 'seed': 7 } - models, cfg, task = checkpoint_utils.load_model_ensemble_and_task( - utils.split_paths(model), arg_overrides=overrides) - - # Move models to GPU - for model in models: - model.eval() - if use_cuda: - model.cuda() - if use_fp16: - model.half() - model.prepare_for_inference_(cfg) - self.models = models - # Initialize generator - self.generator = task.build_generator(models, cfg.generation) - + model, cfg, task = checkpoint_utils.load_model_ensemble_and_task( + utils.split_paths(local_model), arg_overrides=overrides) + del model # Initialize transform from torchvision import transforms mean = [0.5, 0.5, 0.5] @@ -69,7 +73,8 @@ class ImageCaptionPipeline(Pipeline): self.eos_item = torch.LongTensor([task.src_dict.eos()]) self.pad_idx = task.src_dict.pad() - def preprocess(self, input: Input) -> Dict[str, Any]: + @type_assert(object, (str, tuple, Image.Image)) + def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: def encode_text(text, length=None, append_bos=False, append_eos=False): s = self.task.tgt_dict.encode_line( @@ -84,11 
+89,11 @@ class ImageCaptionPipeline(Pipeline): s = torch.cat([s, self.eos_item]) return s - if isinstance(input, Image.Image): - patch_image = self.patch_resize_transform(input).unsqueeze(0) + if isinstance(data, Image.Image): + patch_image = self.patch_resize_transform(data).unsqueeze(0) else: patch_image = self.patch_resize_transform( - load_image(input)).unsqueeze(0) + load_image(data)).unsqueeze(0) patch_mask = torch.tensor([True]) text = 'what does the image describe?' src_text = encode_text( @@ -105,17 +110,3 @@ class ImageCaptionPipeline(Pipeline): } } return sample - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from ofa.utils.eval_utils import eval_caption - - results, _ = eval_caption(self.task, self.generator, self.models, - input) - return { - 'image_id': results[0]['image_id'], - 'caption': results[0]['caption'] - } - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - # What should we do here ? - return inputs diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 9bcaa87c..f998da37 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -5,14 +5,17 @@ from typing import Any, Dict, Union from transformers import AutoTokenizer -from modelscope.utils.constant import Fields, InputFields -from modelscope.utils.type_assert import type_assert +from ..metainfo import Models, Preprocessors +from ..utils.constant import Fields, InputFields +from ..utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS __all__ = [ 'Tokenize', 'SequenceClassificationPreprocessor', - 'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor' + 'TextGenerationPreprocessor', 'TokenClassifcationPreprocessor', + 'NLIPreprocessor', 'SentimentClassificationPreprocessor', + 'FillMaskPreprocessor' ] @@ -31,7 +34,141 @@ class Tokenize(Preprocessor): @PREPROCESSORS.register_module( - Fields.nlp, module_name=r'bert-sequence-classification') + Fields.nlp, module_name=Preprocessors.nli_tokenizer) +class NLIPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + from sofa import SbertTokenizer + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, tuple) + def __call__(self, data: tuple) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' 
+ Returns: + Dict[str, Any]: the preprocessed data + """ + sentence1, sentence2 = data + new_data = { + self.first_sequence: sentence1, + self.second_sequence: sentence2 + } + # preprocess the data for the model input + + rst = { + 'id': [], + 'input_ids': [], + 'attention_mask': [], + 'token_type_ids': [] + } + + max_seq_length = self.sequence_length + + text_a = new_data[self.first_sequence] + text_b = new_data[self.second_sequence] + feature = self.tokenizer( + text_a, + text_b, + padding=False, + truncation=True, + max_length=max_seq_length) + + rst['id'].append(new_data.get('id', str(uuid.uuid4()))) + rst['input_ids'].append(feature['input_ids']) + rst['attention_mask'].append(feature['attention_mask']) + rst['token_type_ids'].append(feature['token_type_ids']) + + return rst + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer) +class SentimentClassificationPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + from sofa import SbertTokenizer + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' + Returns: + Dict[str, Any]: the preprocessed data + """ + + new_data = {self.first_sequence: data} + # preprocess the data for the model input + + rst = { + 'id': [], + 'input_ids': [], + 'attention_mask': [], + 'token_type_ids': [] + } + + max_seq_length = self.sequence_length + + text_a = new_data[self.first_sequence] + + text_b = new_data.get(self.second_sequence, None) + feature = self.tokenizer( + text_a, + text_b, + padding='max_length', + truncation=True, + max_length=max_seq_length) + + rst['id'].append(new_data.get('id', str(uuid.uuid4()))) + rst['input_ids'].append(feature['input_ids']) + rst['attention_mask'].append(feature['attention_mask']) + rst['token_type_ids'].append(feature['token_type_ids']) + + return rst + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer) class SequenceClassificationPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): @@ -53,12 +190,12 @@ class SequenceClassificationPreprocessor(Preprocessor): self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) print(f'this is the tokenzier {self.tokenizer}') - @type_assert(object, (str, tuple)) - def __call__(self, data: Union[str, tuple]) -> Dict[str, Any]: + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: """process the raw input data Args: - data (str or tuple): + data (str or tuple, Dict): sentence1 (str): a sentence Example: 'you are so handsome.' @@ -70,22 +207,31 @@ class SequenceClassificationPreprocessor(Preprocessor): sentence2 (str): a sentence Example: 'you are so beautiful.' + or + {field1: field_value1, field2: field_value2} + field1 (str): field name, default 'first_sequence' + field_value1 (str): a sentence + Example: + 'you are so handsome.' 
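Both of the new sofa-based preprocessors above return the same dict-of-lists feature layout, so the downstream models can consume them uniformly. A minimal sketch, not part of the patch; the model directory is hypothetical and must contain the sbert vocabulary:

from modelscope.preprocessors import SentimentClassificationPreprocessor

preprocessor = SentimentClassificationPreprocessor('/path/to/sbert_model_dir')  # hypothetical path
features = preprocessor('the service was excellent')
# features -> {'id': [...], 'input_ids': [[...]], 'attention_mask': [[...]], 'token_type_ids': [[...]]}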
+ + field2 (str): field name, default 'second_sequence' + field_value2 (str): a sentence + Example: + 'you are so beautiful.' Returns: Dict[str, Any]: the preprocessed data """ - - if not isinstance(data, tuple): - data = ( - data, - None, - ) - - sentence1, sentence2 = data - new_data = { - self.first_sequence: sentence1, - self.second_sequence: sentence2 - } + if isinstance(data, str): + new_data = {self.first_sequence: data} + elif isinstance(data, tuple): + sentence1, sentence2 = data + new_data = { + self.first_sequence: sentence1, + self.second_sequence: sentence2 + } + else: + new_data = data # preprocess the data for the model input @@ -115,7 +261,8 @@ class SequenceClassificationPreprocessor(Preprocessor): return rst -@PREPROCESSORS.register_module(Fields.nlp, module_name=r'palm2.0') +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer) class TextGenerationPreprocessor(Preprocessor): def __init__(self, model_dir: str, tokenizer, *args, **kwargs): @@ -166,12 +313,66 @@ class TextGenerationPreprocessor(Preprocessor): rst['input_ids'].append(feature['input_ids']) rst['attention_mask'].append(feature['attention_mask']) + return {k: torch.tensor(v) for k, v in rst.items()} + + +@PREPROCESSORS.register_module(Fields.nlp) +class FillMaskPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + from sofa.utils.backend import AutoTokenizer + self.model_dir = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_dir, use_fast=False) + + @type_assert(object, str) + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + 'you are so handsome.' 
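As the widened type hint above shows, SequenceClassificationPreprocessor.__call__ now dispatches on three input shapes. A minimal sketch, not part of the patch, assuming preprocessor is an already constructed SequenceClassificationPreprocessor instance:

preprocessor('you are so handsome.')                                # single sentence
preprocessor(('you are so handsome.', 'you are so beautiful.'))     # sentence pair
preprocessor({'first_sequence': 'you are so handsome.',
              'second_sequence': 'you are so beautiful.'})          # explicit field dict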
+ + Returns: + Dict[str, Any]: the preprocessed data + """ + import torch + + new_data = {self.first_sequence: data} + # preprocess the data for the model input + + rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []} + + max_seq_length = self.sequence_length + + text_a = new_data[self.first_sequence] + feature = self.tokenizer( + text_a, + padding='max_length', + truncation=True, + max_length=max_seq_length, + return_token_type_ids=True) + + rst['input_ids'].append(feature['input_ids']) + rst['attention_mask'].append(feature['attention_mask']) + rst['token_type_ids'].append(feature['token_type_ids']) return {k: torch.tensor(v) for k, v in rst.items()} @PREPROCESSORS.register_module( - Fields.nlp, module_name=r'bert-token-classification') + Fields.nlp, module_name=Preprocessors.token_cls_tokenizer) class TokenClassifcationPreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): @@ -199,6 +400,7 @@ class TokenClassifcationPreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ + # preprocess the data for the model input text = data.replace(' ', '').strip() diff --git a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py index c5a6b34c..733abf24 100644 --- a/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_intent_prediction_preprocessor.py @@ -3,13 +3,12 @@ import os from typing import Any, Dict -from modelscope.preprocessors.space.fields.intent_field import \ - IntentBPETextField -from modelscope.utils.config import Config -from modelscope.utils.constant import Fields -from modelscope.utils.type_assert import type_assert +from ...utils.config import Config +from ...utils.constant import Fields +from ...utils.type_assert import type_assert from ..base import Preprocessor from ..builder import PREPROCESSORS +from .fields.intent_field import IntentBPETextField __all__ = ['DialogIntentPredictionPreprocessor'] diff --git a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py index 5061ba35..b0758b40 100644 --- a/modelscope/preprocessors/space/dialog_modeling_preprocessor.py +++ b/modelscope/preprocessors/space/dialog_modeling_preprocessor.py @@ -1,16 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -import uuid -from typing import Any, Dict, Union - -from modelscope.preprocessors.space.fields.gen_field import \ - MultiWOZBPETextField -from modelscope.utils.config import Config -from modelscope.utils.constant import Fields, InputFields -from modelscope.utils.type_assert import type_assert +from typing import Any, Dict + +from ...utils.config import Config +from ...utils.constant import Fields +from ...utils.type_assert import type_assert from ..base import Preprocessor from ..builder import PREPROCESSORS +from .fields.gen_field import MultiWOZBPETextField __all__ = ['DialogModelingPreprocessor'] diff --git a/modelscope/preprocessors/space/fields/dst_processors.py b/modelscope/preprocessors/space/fields/dst_processors.py index 6d888bff..c5c81f66 100644 --- a/modelscope/preprocessors/space/fields/dst_processors.py +++ b/modelscope/preprocessors/space/fields/dst_processors.py @@ -154,14 +154,16 @@ utter3 = { 'User-2': 'I am looking for an expensive indian restaurant in the area of centre.', 'System-2': - 'Might I recommend Saffron Brasserie? 
That is an expensive Indian restaurant in the center of town. I can book a table for you, if you like.', + 'Might I recommend Saffron Brasserie? That is an expensive Indian restaurant ' + 'in the center of town. I can book a table for you, if you like.', 'Dialog_Act-2': { 'Restaurant-Recommend': [['area', 'center of town'], ['food', 'Indian'], ['name', 'Saffron Brasserie'], ['pricerange', 'expensive']] }, - 'User-3': 'Sure thing, please book for 6 people at 19:30 on Saturday.' + 'User-3': + 'Sure thing, please book for 6 people at 19:30 on Saturday.' } history_states3 = [{}, { @@ -346,7 +348,6 @@ history_states3 = [{}, { class DSTProcessor(object): - ACTS_DICT = { 'taxi-depart': 'taxi-departure', 'taxi-dest': 'taxi-destination', @@ -380,7 +381,8 @@ class DSTProcessor(object): def _convert_inputs_to_utterances(self, inputs: dict, history_states: list): - """This method is to generate the utterances with user, sys, dialog_acts and metadata, while metadata is from the history_states or the output from the inference pipline""" + """This method is to generate the utterances with user, sys, dialog_acts and metadata, + while metadata is from the history_states or the output from the inference pipline""" utterances = [] user_inputs = [] @@ -427,8 +429,8 @@ class DSTProcessor(object): if isinstance(item, dict): for a in item: aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' or aa[ - 1] == 'select' or aa[1] == 'book': + if aa[1] == 'inform' or aa[1] == 'recommend' or \ + aa[1] == 'select' or aa[1] == 'book': for i in item[a]: s = i[0].lower() v = i[1].lower().strip() @@ -443,7 +445,7 @@ class DSTProcessor(object): if key not in s_dict: s_dict[key] = list([v]) # ... Option 2: Keep last informed value - #s_dict[key] = list([v]) + # s_dict[key] = list([v]) return s_dict @@ -454,26 +456,26 @@ class multiwoz22Processor(DSTProcessor): super().__init__() def normalize_time(self, text): - text = re.sub('(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2', + text = re.sub(r'(\d{1})(a\.?m\.?|p\.?m\.?)', r'\1 \2', text) # am/pm without space - text = re.sub('(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3', + text = re.sub(r'(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)', r'\1\2:00 \3', text) # am/pm short to long form text = re.sub( - '(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)', + r'(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)', r'\1\2 \3:\4\5', text) # Missing separator - text = re.sub('(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3', + text = re.sub(r'(^| )(\d{2})[;.,](\d{2})', r'\1\2:\3', text) # Wrong separator - text = re.sub('(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)', + text = re.sub(r'(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)', r'\1\2 \3:00\4', text) # normalize simple full hour time - text = re.sub('(^| )(\d{1}:\d{2})', r'\g<1>0\2', + text = re.sub(r'(^| )(\d{1}:\d{2})', r'\g<1>0\2', text) # Add missing leading 0 # Map 12 hour times to 24 hour times - text = re.sub( - '(\d{2})(:\d{2}) ?p\.?m\.?', lambda x: str( - int(x.groups()[0]) + 12 - if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups( - )[1], text) - text = re.sub('(^| )24:(\d{2})', r'\g<1>00:\2', + text = \ + re.sub( + r'(\d{2})(:\d{2}) ?p\.?m\.?', + lambda x: str(int(x.groups()[0]) + 12 + if int(x.groups()[0]) < 12 else int(x.groups()[0])) + x.groups()[1], text) + text = re.sub(r'(^| )24:(\d{2})', r'\g<1>00:\2', text) # Correct times that use 24 as hour return text @@ -508,8 +510,8 @@ class multiwoz22Processor(DSTProcessor): if isinstance(acts[d][t]['dialog_act'], dict): for a in 
acts[d][t]['dialog_act']: aa = a.lower().split('-') - if aa[1] == 'inform' or aa[1] == 'recommend' or aa[ - 1] == 'select' or aa[1] == 'book': + if aa[1] == 'inform' or aa[1] == 'recommend' \ + or aa[1] == 'select' or aa[1] == 'book': for i in acts[d][t]['dialog_act'][a]: s = i[0].lower() v = i[1].lower().strip() @@ -524,7 +526,7 @@ class multiwoz22Processor(DSTProcessor): if key not in s_dict: s_dict[key] = list([v]) # ... Option 2: Keep last informed value - #s_dict[key] = list([v]) + # s_dict[key] = list([v]) return s_dict # This should only contain label normalizations. All other mappings should @@ -560,7 +562,7 @@ class multiwoz22Processor(DSTProcessor): utt_lower = convert_to_unicode(utt).lower() utt_lower = self.normalize_text(utt_lower) utt_tok = [ - tok for tok in map(str.strip, re.split('(\W+)', utt_lower)) + tok for tok in map(str.strip, re.split(r'(\W+)', utt_lower)) if len(tok) > 0 ] return utt_tok @@ -582,7 +584,7 @@ class multiwoz22Processor(DSTProcessor): find_pos = [] found = False label_list = [ - item for item in map(str.strip, re.split('(\W+)', value_label)) + item for item in map(str.strip, re.split(r'(\W+)', value_label)) if len(item) > 0 ] len_label = len(label_list) @@ -633,11 +635,11 @@ class multiwoz22Processor(DSTProcessor): def is_in_list(self, tok, value): found = False tok_list = [ - item for item in map(str.strip, re.split('(\W+)', tok)) + item for item in map(str.strip, re.split(r'(\W+)', tok)) if len(item) > 0 ] value_list = [ - item for item in map(str.strip, re.split('(\W+)', value)) + item for item in map(str.strip, re.split(r'(\W+)', value)) if len(item) > 0 ] tok_len = len(tok_list) @@ -938,8 +940,8 @@ class multiwoz22Processor(DSTProcessor): if slot not in diag_seen_slots_dict or value_label != diag_seen_slots_value_dict[ slot]: print('(%s): %s, ' % (slot, value_label), end='') - elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[ - slot] and class_type != 'copy_value' and class_type != 'inform': + elif slot in diag_seen_slots_dict and class_type == diag_seen_slots_dict[slot] \ + and class_type != 'copy_value' and class_type != 'inform': # If slot has seen before and its class type did not change, label this slot a not present, # assuming that the slot has not actually been mentioned in this turn. # Exceptions are copy_value and inform. If a seen slot has been tagged as copy_value or inform, @@ -1262,7 +1264,7 @@ def convert_examples_to_features(examples, def _get_start_end_pos(class_type, token_label_ids, max_seq_length): if class_type == 'copy_value' and 1 not in token_label_ids: - #logger.warn("copy_value label, but token_label not detected. Setting label to 'none'.") + # logger.warn("copy_value label, but token_label not detected. 
Setting label to 'none'.") class_type = 'none' start_pos = 0 end_pos = 0 diff --git a/modelscope/preprocessors/space/fields/gen_field.py b/modelscope/preprocessors/space/fields/gen_field.py index 7012697f..49a30e8f 100644 --- a/modelscope/preprocessors/space/fields/gen_field.py +++ b/modelscope/preprocessors/space/fields/gen_field.py @@ -8,10 +8,10 @@ from itertools import chain import numpy as np -from modelscope.preprocessors.space.tokenizer import Tokenizer -from modelscope.utils.nlp.space import ontology, utils -from modelscope.utils.nlp.space.db_ops import MultiWozDB -from modelscope.utils.nlp.space.utils import list2np +from ....utils.nlp.space import ontology, utils +from ....utils.nlp.space.db_ops import MultiWozDB +from ....utils.nlp.space.utils import list2np +from ..tokenizer import Tokenizer class BPETextField(object): diff --git a/modelscope/preprocessors/space/fields/intent_field.py b/modelscope/preprocessors/space/fields/intent_field.py index 9907165e..35e1693c 100644 --- a/modelscope/preprocessors/space/fields/intent_field.py +++ b/modelscope/preprocessors/space/fields/intent_field.py @@ -14,10 +14,10 @@ import json import numpy as np from tqdm import tqdm -from modelscope.preprocessors.space.tokenizer import Tokenizer -from modelscope.utils.nlp.space import ontology, utils -from modelscope.utils.nlp.space.scores import hierarchical_set_score -from modelscope.utils.nlp.space.utils import list2np +from ....utils.nlp.space import ontology, utils +from ....utils.nlp.space.scores import hierarchical_set_score +from ....utils.nlp.space.utils import list2np +from ..tokenizer import Tokenizer class BPETextField(object): diff --git a/modelscope/preprocessors/text_to_speech.py b/modelscope/preprocessors/text_to_speech.py index fd41b752..9d8af6fa 100644 --- a/modelscope/preprocessors/text_to_speech.py +++ b/modelscope/preprocessors/text_to_speech.py @@ -2,9 +2,8 @@ import io from typing import Any, Dict, Union -import ttsfrd - from modelscope.fileio import File +from modelscope.metainfo import Preprocessors from modelscope.models.audio.tts.frontend import GenericTtsFrontend from modelscope.models.base import Model from modelscope.utils.audio.tts_exceptions import * # noqa F403 @@ -12,11 +11,11 @@ from modelscope.utils.constant import Fields from .base import Preprocessor from .builder import PREPROCESSORS -__all__ = ['TextToTacotronSymbols', 'text_to_tacotron_symbols'] +__all__ = ['TextToTacotronSymbols'] @PREPROCESSORS.register_module( - Fields.audio, module_name=r'text_to_tacotron_symbols') + Fields.audio, module_name=Preprocessors.text_to_tacotron_symbols) class TextToTacotronSymbols(Preprocessor): """extract tacotron symbols from text. diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py new file mode 100644 index 00000000..262fdaa5 --- /dev/null +++ b/modelscope/preprocessors/video.py @@ -0,0 +1,232 @@ +import math +import os +import random + +import decord +import numpy as np +import torch +import torch.nn as nn +import torch.utils.data +import torch.utils.dlpack as dlpack +import torchvision.transforms._transforms_video as transforms +from decord import VideoReader +from torchvision.transforms import Compose + + +def ReadVideoData(cfg, video_path): + """ simple interface to load video frames from file + + Args: + cfg (Config): The global config object. 
+ video_path (str): video file path + """ + data = _decode_video(cfg, video_path) + transform = kinetics400_tranform(cfg) + data_list = [] + for i in range(data.size(0)): + for j in range(cfg.TEST.NUM_SPATIAL_CROPS): + transform.transforms[1].set_spatial_index(j) + data_list.append(transform(data[i])) + return torch.stack(data_list, dim=0) + + +def kinetics400_tranform(cfg): + """ + Configs the transform for the kinetics-400 dataset. + We apply controlled spatial cropping and normalization. + Args: + cfg (Config): The global config object. + """ + resize_video = KineticsResizedCrop( + short_side_range=[cfg.DATA.TEST_SCALE, cfg.DATA.TEST_SCALE], + crop_size=cfg.DATA.TEST_CROP_SIZE, + num_spatial_crops=cfg.TEST.NUM_SPATIAL_CROPS) + std_transform_list = [ + transforms.ToTensorVideo(), resize_video, + transforms.NormalizeVideo( + mean=cfg.DATA.MEAN, std=cfg.DATA.STD, inplace=True) + ] + return Compose(std_transform_list) + + +def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx, + num_clips, num_frames, interval, minus_interval): + """ + Generates the frame index list using interval based sampling. + Args: + vid_length (int): the length of the whole video (valid selection range). + vid_fps (int): the original video fps + target_fps (int): the normalized video fps + clip_idx (int): -1 for random temporal sampling, and positive values for + sampling specific clip from the video + num_clips (int): the total clips to be sampled from each video. + combined with clip_idx, the sampled video is the "clip_idx-th" + video from "num_clips" videos. + num_frames (int): number of frames in each sampled clips. + interval (int): the interval to sample each frame. + minus_interval (bool): control the end index + Returns: + index (tensor): the sampled frame indexes + """ + if num_frames == 1: + index = [random.randint(0, vid_length - 1)] + else: + # transform FPS + clip_length = num_frames * interval * vid_fps / target_fps + + max_idx = max(vid_length - clip_length, 0) + start_idx = clip_idx * math.floor(max_idx / (num_clips - 1)) + if minus_interval: + end_idx = start_idx + clip_length - interval + else: + end_idx = start_idx + clip_length - 1 + + index = torch.linspace(start_idx, end_idx, num_frames) + index = torch.clamp(index, 0, vid_length - 1).long() + + return index + + +def _decode_video_frames_list(cfg, frames_list, vid_fps): + """ + Decodes the video given the numpy frames. + Args: + cfg (Config): The global config object. + frames_list (list): all frames for a video, the frames should be numpy array. + vid_fps (int): the fps of this video. + Returns: + frames (Tensor): video tensor data + """ + assert isinstance(frames_list, list) + num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS + + frame_list = [] + for clip_idx in range(num_clips_per_video): + # for each clip in the video, + # a list is generated before decoding the specified frames from the video + list_ = _interval_based_sampling( + len(frames_list), vid_fps, cfg.DATA.TARGET_FPS, clip_idx, + num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES, + cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL) + frames = None + frames = torch.from_numpy( + np.stack([frames_list[l_index] for l_index in list_.tolist()], + axis=0)) + frame_list.append(frames) + frames = torch.stack(frame_list) + if num_clips_per_video == 1: + frames = frames.squeeze(0) + + return frames + + +def _decode_video(cfg, path): + """ + Decodes the video given the numpy frames. + Args: + path (str): video file path. 
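_interval_based_sampling above is easiest to follow with concrete numbers. A minimal sketch, not part of the patch, assuming the function is imported directly from modelscope.preprocessors.video; with 8 frames per clip, an interval of 8, and matching source and target fps, clip_length = 8 * 8 * 30 / 30 = 64 frames, so two clips of a 300-frame video start at frames 0 and 236:

from modelscope.preprocessors.video import _interval_based_sampling

idx0 = _interval_based_sampling(vid_length=300, vid_fps=30, target_fps=30,
                                clip_idx=0, num_clips=2, num_frames=8,
                                interval=8, minus_interval=False)
idx1 = _interval_based_sampling(vid_length=300, vid_fps=30, target_fps=30,
                                clip_idx=1, num_clips=2, num_frames=8,
                                interval=8, minus_interval=False)
# idx0 -> tensor([  0,   9,  18,  27,  36,  45,  54,  63])
# idx1 -> tensor([236, 245, 254, 263, 272, 281, 290, 299])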
+ Returns: + frames (Tensor): video tensor data + """ + vr = VideoReader(path) + + num_clips_per_video = cfg.TEST.NUM_ENSEMBLE_VIEWS + + frame_list = [] + for clip_idx in range(num_clips_per_video): + # for each clip in the video, + # a list is generated before decoding the specified frames from the video + list_ = _interval_based_sampling( + len(vr), vr.get_avg_fps(), cfg.DATA.TARGET_FPS, clip_idx, + num_clips_per_video, cfg.DATA.NUM_INPUT_FRAMES, + cfg.DATA.SAMPLING_RATE, cfg.DATA.MINUS_INTERVAL) + frames = None + if path.endswith('.avi'): + append_list = torch.arange(0, list_[0], 4) + frames = dlpack.from_dlpack( + vr.get_batch(torch.cat([append_list, + list_])).to_dlpack()).clone() + frames = frames[append_list.shape[0]:] + else: + frames = dlpack.from_dlpack( + vr.get_batch(list_).to_dlpack()).clone() + frame_list.append(frames) + frames = torch.stack(frame_list) + if num_clips_per_video == 1: + frames = frames.squeeze(0) + del vr + return frames + + +class KineticsResizedCrop(object): + """Perform resize and crop for kinetics-400 dataset + Args: + short_side_range (list): The length of short side range. In inference, this shoudle be [256, 256] + crop_size (int): The cropped size for frames. + num_spatial_crops (int): The number of the cropped spatial regions in each video. + """ + + def __init__( + self, + short_side_range, + crop_size, + num_spatial_crops=1, + ): + self.idx = -1 + self.short_side_range = short_side_range + self.crop_size = int(crop_size) + self.num_spatial_crops = num_spatial_crops + + def _get_controlled_crop(self, clip): + """Perform controlled crop for video tensor. + Args: + clip (Tensor): the video data, the shape is [T, C, H, W] + """ + _, _, clip_height, clip_width = clip.shape + + length = self.short_side_range[0] + + if clip_height < clip_width: + new_clip_height = int(length) + new_clip_width = int(clip_width / clip_height * new_clip_height) + new_clip = torch.nn.functional.interpolate( + clip, size=(new_clip_height, new_clip_width), mode='bilinear') + else: + new_clip_width = int(length) + new_clip_height = int(clip_height / clip_width * new_clip_width) + new_clip = torch.nn.functional.interpolate( + clip, size=(new_clip_height, new_clip_width), mode='bilinear') + x_max = int(new_clip_width - self.crop_size) + y_max = int(new_clip_height - self.crop_size) + if self.num_spatial_crops == 1: + x = x_max // 2 + y = y_max // 2 + elif self.num_spatial_crops == 3: + if self.idx == 0: + if new_clip_width == length: + x = x_max // 2 + y = 0 + elif new_clip_height == length: + x = 0 + y = y_max // 2 + elif self.idx == 1: + x = x_max // 2 + y = y_max // 2 + elif self.idx == 2: + if new_clip_width == length: + x = x_max // 2 + y = y_max + elif new_clip_height == length: + x = x_max + y = y_max // 2 + return new_clip[:, :, y:y + self.crop_size, x:x + self.crop_size] + + def set_spatial_index(self, idx): + """Set the spatial cropping index for controlled cropping.. + Args: + idx (int): the spatial index. The value should be in [0, 1, 2], means [left, center, right], respectively. 
+ """ + self.idx = idx + + def __call__(self, clip): + return self._get_controlled_crop(clip) diff --git a/modelscope/pydatasets/config.py b/modelscope/pydatasets/config.py new file mode 100644 index 00000000..e916b3ec --- /dev/null +++ b/modelscope/pydatasets/config.py @@ -0,0 +1,22 @@ +import os +from pathlib import Path + +# Cache location +DEFAULT_CACHE_HOME = '~/.cache' +CACHE_HOME = os.getenv('CACHE_HOME', DEFAULT_CACHE_HOME) +DEFAULT_MS_CACHE_HOME = os.path.join(CACHE_HOME, 'modelscope/hub') +MS_CACHE_HOME = os.path.expanduser( + os.getenv('MS_CACHE_HOME', DEFAULT_MS_CACHE_HOME)) + +DEFAULT_MS_DATASETS_CACHE = os.path.join(MS_CACHE_HOME, 'datasets') +MS_DATASETS_CACHE = Path( + os.getenv('MS_DATASETS_CACHE', DEFAULT_MS_DATASETS_CACHE)) + +DOWNLOADED_DATASETS_DIR = 'downloads' +DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(MS_DATASETS_CACHE, + DOWNLOADED_DATASETS_DIR) +DOWNLOADED_DATASETS_PATH = Path( + os.getenv('DOWNLOADED_DATASETS_PATH', DEFAULT_DOWNLOADED_DATASETS_PATH)) + +MS_HUB_ENDPOINT = os.environ.get('MS_HUB_ENDPOINT', + 'http://101.201.119.157:31752') diff --git a/modelscope/pydatasets/py_dataset.py b/modelscope/pydatasets/py_dataset.py index 78aedaa0..49137253 100644 --- a/modelscope/pydatasets/py_dataset.py +++ b/modelscope/pydatasets/py_dataset.py @@ -1,64 +1,81 @@ -from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, - Union) +import os +from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, + Sequence, Union) -from datasets import Dataset, load_dataset +import numpy as np +from datasets import Dataset +from datasets import load_dataset as hf_load_dataset +from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE +from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES +from datasets.utils.file_utils import (is_relative_path, + relative_to_absolute_path) +from modelscope.pydatasets.config import MS_DATASETS_CACHE +from modelscope.pydatasets.utils.ms_api import MsApi from modelscope.utils.constant import Hubs from modelscope.utils.logger import get_logger logger = get_logger() +def format_list(para) -> List: + if para is None: + para = [] + elif isinstance(para, str): + para = [para] + elif len(set(para)) < len(para): + raise ValueError(f'List columns contains duplicates: {para}') + return para + + class PyDataset: _hf_ds = None # holds the underlying HuggingFace Dataset """A PyDataset backed by hugging face Dataset.""" - def __init__(self, hf_ds: Dataset): + def __init__(self, hf_ds: Dataset, target: Optional[str] = None): self._hf_ds = hf_ds - self.target = None + self.target = target def __iter__(self): - if isinstance(self._hf_ds, Dataset): - for item in self._hf_ds: - if self.target is not None: - yield item[self.target] - else: - yield item - else: - for ds in self._hf_ds.values(): - for item in ds: - if self.target is not None: - yield item[self.target] - else: - yield item + for item in self._hf_ds: + if self.target is not None: + yield item[self.target] + else: + yield item + + def __getitem__(self, key): + return self._hf_ds[key] @classmethod def from_hf_dataset(cls, hf_ds: Dataset, - target: str = None) -> 'PyDataset': - dataset = cls(hf_ds) - dataset.target = target - return dataset + target: str = None) -> Union[dict, 'PyDataset']: + if isinstance(hf_ds, Dataset): + return cls(hf_ds, target) + if len(hf_ds.keys()) == 1: + return cls(next(iter(hf_ds.values())), target) + return {k: cls(v, target) for k, v in hf_ds.items()} @staticmethod - def load(path: Union[str, list], - target: Optional[str] = None, - 
version: Optional[str] = None, - name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[str, Sequence[str], - Mapping[str, - Union[str, - Sequence[str]]]]] = None, - hub: Optional[Hubs] = None) -> 'PyDataset': + def load( + dataset_name: Union[str, list], + target: Optional[str] = None, + version: Optional[str] = None, + hub: Optional[Hubs] = Hubs.modelscope, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None + ) -> Union[dict, 'PyDataset']: """Load a PyDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. Args: - path (str): Path or name of the dataset. + dataset_name (str): Path or name of the dataset. target (str, optional): Name of the column to output. version (str, optional): Version of the dataset script to load: - name (str, optional): Defining the subset_name of the dataset. + subset_name (str, optional): Defining the subset_name of the dataset. data_dir (str, optional): Defining the data_dir of the dataset configuration. I data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s). split (str, optional): Which split of the data to load. @@ -67,53 +84,302 @@ class PyDataset: Returns: PyDataset (obj:`PyDataset`): PyDataset object for a certain dataset. """ - if Hubs.modelscope == hub: - # TODO: parse data meta information from modelscope hub - # and possibly download data files to local (and update path) - print('getting data from modelscope hub') - if isinstance(path, str): - dataset = load_dataset( - path, - name=name, + if hub == Hubs.huggingface: + dataset = hf_load_dataset( + dataset_name, + name=subset_name, revision=version, split=split, data_dir=data_dir, data_files=data_files) - elif isinstance(path, list): + return PyDataset.from_hf_dataset(dataset, target=target) + else: + return PyDataset._load_ms_dataset( + dataset_name, + target=target, + subset_name=subset_name, + version=version, + split=split, + data_dir=data_dir, + data_files=data_files) + + @staticmethod + def _load_ms_dataset( + dataset_name: Union[str, list], + target: Optional[str] = None, + version: Optional[str] = None, + subset_name: Optional[str] = None, + split: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None + ) -> Union[dict, 'PyDataset']: + if isinstance(dataset_name, str): + use_hf = False + if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir(dataset_name) or \ + (os.path.isfile(dataset_name) and dataset_name.endswith('.py')): + use_hf = True + elif is_relative_path(dataset_name): + ms_api = MsApi() + dataset_scripts = ms_api.fetch_dataset_scripts( + dataset_name, version) + if 'py' in dataset_scripts: # dataset copied from hf datasets + dataset_name = dataset_scripts['py'][0] + use_hf = True + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " + f'or any data file in the same directory.') + + if use_hf: + dataset = hf_load_dataset( + dataset_name, + name=subset_name, + revision=version, + split=split, + data_dir=data_dir, + data_files=data_files, + cache_dir=MS_DATASETS_CACHE) + else: + # TODO load from ms datahub + raise NotImplementedError( + f'Dataset {dataset_name} load from modelscope datahub to be implemented in ' + f'the future') + elif 
isinstance(dataset_name, list): if target is None: target = 'target' - dataset = Dataset.from_dict({target: [p] for p in path}) + dataset = Dataset.from_dict({target: dataset_name}) else: raise TypeError('path must be a str or a list, but got' - f' {type(path)}') + f' {type(dataset_name)}') return PyDataset.from_hf_dataset(dataset, target=target) + def to_torch_dataset_with_processors( + self, + preprocessors: Union[Callable, List[Callable]], + columns: Union[str, List[str]] = None, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + columns = format_list(columns) + + columns = [ + key for key in self._hf_ds.features.keys() if key in columns + ] + sample = next(iter(self._hf_ds)) + + sample_res = {k: np.array(sample[k]) for k in columns} + for processor in preprocessor_list: + sample_res.update( + {k: np.array(v) + for k, v in processor(sample).items()}) + + def is_numpy_number(value): + return np.issubdtype(value.dtype, np.integer) or np.issubdtype( + value.dtype, np.floating) + + retained_columns = [] + for k in sample_res.keys(): + if not is_numpy_number(sample_res[k]): + logger.warning( + f'Data of column {k} is non-numeric, will be removed') + continue + retained_columns.append(k) + + import torch + + class MsIterableDataset(torch.utils.data.IterableDataset): + + def __init__(self, dataset: Iterable): + super(MsIterableDataset).__init__() + self.dataset = dataset + + def __iter__(self): + for item_dict in self.dataset: + res = { + k: np.array(item_dict[k]) + for k in columns if k in retained_columns + } + for preprocessor in preprocessor_list: + res.update({ + k: np.array(v) + for k, v in preprocessor(item_dict).items() + if k in retained_columns + }) + yield res + + return MsIterableDataset(self._hf_ds) + def to_torch_dataset( self, columns: Union[str, List[str]] = None, - output_all_columns: bool = False, + preprocessors: Union[Callable, List[Callable]] = None, **format_kwargs, ): - self._hf_ds.reset_format() - self._hf_ds.set_format( - type='torch', - columns=columns, - output_all_columns=output_all_columns, - format_kwargs=format_kwargs) - return self._hf_ds + """Create a torch.utils.data.Dataset from the MS Dataset. The torch.utils.data.Dataset can be passed to + torch.utils.data.DataLoader. + + Args: + preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process + every sample of the dataset. The output type of processors is dict, and each numeric field of the dict + will be used as a field of torch.utils.data.Dataset. + columns (str or List[str], default None): Dataset column(s) to be loaded (numeric data only). If the + preprocessor is None, the arg columns must have at least one column. If the `preprocessors` is not None, + the output fields of processors will also be added. + format_kwargs: A `dict` of arguments to be passed to the `torch.tensor`. 
+ + Returns: + :class:`tf.data.Dataset` + + """ + if not TORCH_AVAILABLE: + raise ImportError( + 'The function to_torch_dataset requires pytorch to be installed' + ) + if preprocessors is not None: + return self.to_torch_dataset_with_processors(preprocessors) + else: + self._hf_ds.reset_format() + self._hf_ds.set_format( + type='torch', columns=columns, format_kwargs=format_kwargs) + return self._hf_ds + + def to_tf_dataset_with_processors( + self, + batch_size: int, + shuffle: bool, + preprocessors: Union[Callable, List[Callable]], + drop_remainder: bool = None, + prefetch: bool = True, + label_cols: Union[str, List[str]] = None, + columns: Union[str, List[str]] = None, + ): + preprocessor_list = preprocessors if isinstance( + preprocessors, list) else [preprocessors] + + label_cols = format_list(label_cols) + columns = format_list(columns) + cols_to_retain = list(set(label_cols + columns)) + retained_columns = [ + key for key in self._hf_ds.features.keys() if key in cols_to_retain + ] + import tensorflow as tf + tf_dataset = tf.data.Dataset.from_tensor_slices( + np.arange(len(self._hf_ds), dtype=np.int64)) + if shuffle: + tf_dataset = tf_dataset.shuffle(buffer_size=len(self._hf_ds)) + + def func(i, return_dict=False): + i = int(i) + res = {k: np.array(self._hf_ds[i][k]) for k in retained_columns} + for preprocessor in preprocessor_list: + # TODO preprocessor output may have the same key + res.update({ + k: np.array(v) + for k, v in preprocessor(self._hf_ds[i]).items() + }) + if return_dict: + return res + return tuple(list(res.values())) + + sample_res = func(0, True) + + @tf.function(input_signature=[tf.TensorSpec(None, tf.int64)]) + def fetch_function(i): + output = tf.numpy_function( + func, + inp=[i], + Tout=[ + tf.dtypes.as_dtype(val.dtype) + for val in sample_res.values() + ], + ) + return {key: output[i] for i, key in enumerate(sample_res)} + + tf_dataset = tf_dataset.map( + fetch_function, num_parallel_calls=tf.data.AUTOTUNE) + if label_cols: + + def split_features_and_labels(input_batch): + labels = { + key: tensor + for key, tensor in input_batch.items() if key in label_cols + } + if len(input_batch) == 1: + input_batch = next(iter(input_batch.values())) + if len(labels) == 1: + labels = next(iter(labels.values())) + return input_batch, labels + + tf_dataset = tf_dataset.map(split_features_and_labels) + + elif len(columns) == 1: + tf_dataset = tf_dataset.map(lambda x: next(iter(x.values()))) + if batch_size > 1: + tf_dataset = tf_dataset.batch( + batch_size, drop_remainder=drop_remainder) + + if prefetch: + tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE) + return tf_dataset def to_tf_dataset( self, - columns: Union[str, List[str]], batch_size: int, shuffle: bool, - collate_fn: Callable, + preprocessors: Union[Callable, List[Callable]] = None, + columns: Union[str, List[str]] = None, + collate_fn: Callable = None, drop_remainder: bool = None, collate_fn_args: Dict[str, Any] = None, label_cols: Union[str, List[str]] = None, - dummy_labels: bool = False, prefetch: bool = True, ): + """Create a tf.data.Dataset from the MS Dataset. This tf.data.Dataset can be passed to tf methods like + model.fit() or model.predict(). + + Args: + batch_size (int): Number of samples in a single batch. + shuffle(bool): Shuffle the dataset order. + preprocessors (Callable or List[Callable], default None): (list of) Preprocessor object used to process + every sample of the dataset. The output type of processors is dict, and each field of the dict will be + used as a field of the tf.data. 
Dataset. If the `preprocessors` is None, the `collate_fn` + shouldn't be None. + columns (str or List[str], default None): Dataset column(s) to be loaded. If the preprocessor is None, + the arg columns must have at least one column. If the `preprocessors` is not None, the output fields of + processors will also be added. + collate_fn(Callable, default None): A callable object used to collect lists of samples into a batch. If + the `preprocessors` is None, the `collate_fn` shouldn't be None. + drop_remainder(bool, default None): Drop the last incomplete batch when loading. + collate_fn_args (Dict, optional): A `dict` of arguments to be passed to the`collate_fn`. + label_cols (str or List[str], defalut None): Dataset column(s) to load as labels. + prefetch (bool, default True): Prefetch data. + + Returns: + :class:`tf.data.Dataset` + + """ + if not TF_AVAILABLE: + raise ImportError( + 'The function to_tf_dataset requires Tensorflow to be installed.' + ) + if preprocessors is not None: + return self.to_tf_dataset_with_processors( + batch_size, + shuffle, + preprocessors, + drop_remainder=drop_remainder, + prefetch=prefetch, + label_cols=label_cols, + columns=columns) + + if collate_fn is None: + logger.error( + 'The `preprocessors` and the `collate_fn` should`t be both None.' + ) + return None self._hf_ds.reset_format() return self._hf_ds.to_tf_dataset( columns, @@ -123,7 +389,6 @@ class PyDataset: drop_remainder=drop_remainder, collate_fn_args=collate_fn_args, label_cols=label_cols, - dummy_labels=dummy_labels, prefetch=prefetch) def to_hf_dataset(self) -> Dataset: diff --git a/modelscope/pydatasets/utils/__init__.py b/modelscope/pydatasets/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/pydatasets/utils/ms_api.py b/modelscope/pydatasets/utils/ms_api.py new file mode 100644 index 00000000..04052cc4 --- /dev/null +++ b/modelscope/pydatasets/utils/ms_api.py @@ -0,0 +1,66 @@ +import os +from collections import defaultdict +from typing import Optional + +import requests + +from modelscope.pydatasets.config import (DOWNLOADED_DATASETS_PATH, + MS_HUB_ENDPOINT) +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class MsApi: + + def __init__(self, endpoint=MS_HUB_ENDPOINT): + self.endpoint = endpoint + + def list_datasets(self): + path = f'{self.endpoint}/api/v1/datasets' + headers = None + params = {} + r = requests.get(path, params=params, headers=headers) + r.raise_for_status() + dataset_list = r.json()['Data'] + return [x['Name'] for x in dataset_list] + + def fetch_dataset_scripts(self, + dataset_name: str, + version: Optional[str] = 'master', + force_download=False): + datahub_url = f'{self.endpoint}/api/v1/datasets?Query={dataset_name}' + r = requests.get(datahub_url) + r.raise_for_status() + dataset_list = r.json()['Data'] + if len(dataset_list) == 0: + return None + dataset_id = dataset_list[0]['Id'] + version = version or 'master' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={version}' + r = requests.get(datahub_url) + r.raise_for_status() + file_list = r.json()['Data']['Files'] + cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, dataset_name, + version) + os.makedirs(cache_dir, exist_ok=True) + local_paths = defaultdict(list) + for file_info in file_list: + file_path = file_info['Path'] + if file_path.endswith('.py'): + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/files?' 
\ + f'Revision={version}&Path={file_path}' + r = requests.get(datahub_url) + r.raise_for_status() + content = r.json()['Data']['Content'] + local_path = os.path.join(cache_dir, file_path) + if os.path.exists(local_path) and not force_download: + logger.warning( + f"Reusing dataset {dataset_name}'s python file ({local_path})" + ) + local_paths['py'].append(local_path) + continue + with open(local_path, 'w') as f: + f.writelines(content) + local_paths['py'].append(local_path) + return local_paths diff --git a/modelscope/trainers/nlp/space/trainers/gen_trainer.py b/modelscope/trainers/nlp/space/trainers/gen_trainer.py index a0cda25c..e09e2100 100644 --- a/modelscope/trainers/nlp/space/trainers/gen_trainer.py +++ b/modelscope/trainers/nlp/space/trainers/gen_trainer.py @@ -13,7 +13,7 @@ import torch from tqdm import tqdm from transformers.optimization import AdamW, get_linear_schedule_with_warmup -import modelscope.utils.nlp.space.ontology as ontology +from .....utils.nlp.space import ontology from ..metrics.metrics_tracker import MetricsTracker diff --git a/modelscope/trainers/nlp/space/trainers/intent_trainer.py b/modelscope/trainers/nlp/space/trainers/intent_trainer.py index bd43e9a5..2c5081d7 100644 --- a/modelscope/trainers/nlp/space/trainers/intent_trainer.py +++ b/modelscope/trainers/nlp/space/trainers/intent_trainer.py @@ -14,9 +14,7 @@ import torch from tqdm import tqdm from transformers.optimization import AdamW, get_linear_schedule_with_warmup -from modelscope.trainers.nlp.space.metrics.metrics_tracker import \ - MetricsTracker -from modelscope.utils.nlp.space.args import str2bool +from ..metrics.metrics_tracker import MetricsTracker def get_logger(log_path, name='default'): diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index d89f0496..cd232c6a 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -28,9 +28,13 @@ class Tasks(object): image_editing = 'image-editing' image_generation = 'image-generation' image_matting = 'image-matting' + ocr_detection = 'ocr-detection' + action_recognition = 'action-recognition' # nlp tasks word_segmentation = 'word-segmentation' + nli = 'nli' + sentiment_classification = 'sentiment-classification' sentiment_analysis = 'sentiment-analysis' sentence_similarity = 'sentence-similarity' text_classification = 'text-classification' @@ -45,8 +49,7 @@ class Tasks(object): dialog_state_tracking = 'dialog-state-tracking' table_question_answering = 'table-question-answering' feature_extraction = 'feature-extraction' - sentence_similarity = 'sentence-similarity' - fill_mask = 'fill-mask ' + fill_mask = 'fill-mask' summarization = 'summarization' question_answering = 'question-answering' diff --git a/modelscope/utils/hub.py b/modelscope/utils/hub.py index 2f61b148..868e751b 100644 --- a/modelscope/utils/hub.py +++ b/modelscope/utils/hub.py @@ -1,14 +1,67 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import os.path as osp +from typing import List, Optional, Union -from maas_hub.constants import MODEL_ID_SEPARATOR +from requests import HTTPError +from modelscope.hub.file_download import model_file_download +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile -# temp solution before the hub-cache is in place -def get_model_cache_dir(model_id: str, branch: str = 'master'): - model_id_expanded = model_id.replace('/', - MODEL_ID_SEPARATOR) + '.' 
+ branch - default_cache_dir = os.path.expanduser(os.path.join('~/.cache', 'maas')) - return os.getenv('MAAS_CACHE', - os.path.join(default_cache_dir, 'hub', model_id_expanded)) + +def create_model_if_not_exist( + api, + model_id: str, + chinese_name: str, + visibility: Optional[int] = 5, # 1-private, 5-public + license: Optional[str] = 'apache-2.0', + revision: Optional[str] = 'master'): + exists = True + try: + api.get_model(model_id=model_id, revision=revision) + except HTTPError: + exists = False + if exists: + print(f'model {model_id} already exists, skip creation.') + return False + else: + api.create_model( + model_id=model_id, + chinese_name=chinese_name, + visibility=visibility, + license=license) + print(f'model {model_id} successfully created.') + return True + + +def read_config(model_id_or_path: str): + """ Read config from hub or local path + + Args: + model_id_or_path (str): Model repo name or local directory path. + + Return: + config (:obj:`Config`): config object + """ + if not os.path.exists(model_id_or_path): + local_path = model_file_download(model_id_or_path, + ModelFile.CONFIGURATION) + else: + local_path = os.path.join(model_id_or_path, ModelFile.CONFIGURATION) + + return Config.from_file(local_path) + + +def auto_load(model: Union[str, List[str]]): + if isinstance(model, str): + if not osp.exists(model): + model = snapshot_download(model) + else: + model = [ + snapshot_download(m) if not osp.exists(m) else m for m in model + ] + + return model diff --git a/modelscope/utils/registry.py b/modelscope/utils/registry.py index b26b899d..8009b084 100644 --- a/modelscope/utils/registry.py +++ b/modelscope/utils/registry.py @@ -78,7 +78,7 @@ class Registry(object): f'{self._name}[{default_group}] and will ' 'be overwritten') logger.warning(f'{self._modules[default_group][module_name]}' - 'to {module_cls}') + f'to {module_cls}') # also register module in the default group for faster access # only by module name self._modules[default_group][module_name] = module_cls diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index c8ea0442..95e63dba 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -2,6 +2,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os +import unittest + +from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE TEST_LEVEL = 2 TEST_LEVEL_STR = 'TEST_LEVEL' @@ -15,6 +18,18 @@ def test_level(): return TEST_LEVEL +def require_tf(test_case): + if not TF_AVAILABLE: + test_case = unittest.skip('test requires TensorFlow')(test_case) + return test_case + + +def require_torch(test_case): + if not TORCH_AVAILABLE: + test_case = unittest.skip('test requires PyTorch')(test_case) + return test_case + + def set_test_level(level: int): global TEST_LEVEL TEST_LEVEL = level diff --git a/requirements/audio.txt b/requirements/audio.txt index 140836a8..c7b2b239 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,25 +1,25 @@ #tts -h5py==2.10.0 -#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp36-cp36m-linux_x86_64.whl -https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp37-cp37m-linux_x86_64.whl -https://swap.oss-cn-hangzhou.aliyuncs.com/Jiaqi%2Fmaas%2Ftts%2Frequirements%2Fpytorch_wavelets-1.3.0-py3-none-any.whl?Expires=1685688388&OSSAccessKeyId=LTAI4Ffebq4d9jTVDwiSbY4L&Signature=jcQbg5EZ%2Bdys3%2F4BRn3srrKLdIg%3D -#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp38-cp38-linux_x86_64.whl -#https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.1-cp39-cp39-linux_x86_64.whl +h5py +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/pytorch_wavelets-1.3.0-py3-none-any.whl +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp36-cp36m-linux_x86_64.whl; python_version=='3.6' +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp37-cp37m-linux_x86_64.whl; python_version=='3.7' +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp38-cp38-linux_x86_64.whl; python_version=='3.8' +https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/TTS/requirements/ttsfrd-0.0.2-cp39-cp39-linux_x86_64.whl; python_version=='3.9' inflect -keras==2.2.4 +keras librosa lxml matplotlib nara_wpe -numpy==1.18.* -protobuf==3.20.* +numpy +protobuf>3,<=3.20 ptflops PyWavelets>=1.0.0 -scikit-learn==0.23.2 +scikit-learn sox tensorboard tensorflow==1.15.* -torch==1.10.* +torch torchaudio torchvision tqdm diff --git a/requirements/cv.txt b/requirements/cv.txt index 66799b76..513dae99 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1 +1,3 @@ +decord>=0.6.0 easydict +tf_slim diff --git a/requirements/nlp.txt b/requirements/nlp.txt index eefb3c7d..bc0b3fcd 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,4 +1,5 @@ -https://alinlp.alibaba-inc.com/pypi/sofa-1.0.2-py3-none-any.whl -https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz +# https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz +http://ait-public.oss-cn-hangzhou-zmf.aliyuncs.com/jizhu/en_core_web_sm-2.3.1.tar.gz +https://alinlp.alibaba-inc.com/pypi/sofa-1.0.3-py3-none-any.whl spacy>=2.3.5 # python -m spacy download en_core_web_sm diff --git a/requirements/runtime.txt b/requirements/runtime.txt index e97352aa..6580de53 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,13 +1,16 @@ addict datasets easydict -https://mindscope.oss-cn-hangzhou.aliyuncs.com/sdklib/maas_hub-0.2.4.dev0-py3-none-any.whl +filelock>=3.3.0 numpy opencv-python-headless Pillow>=6.2.0 pyyaml requests 
+requests==2.27.1 scipy +setuptools==58.0.4 tokenizers<=0.10.3 +tqdm>=4.64.0 transformers<=4.16.2 yapf diff --git a/tests/hub/__init__.py b/tests/hub/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/hub/test_hub_examples.py b/tests/hub/test_hub_examples.py new file mode 100644 index 00000000..b63445af --- /dev/null +++ b/tests/hub/test_hub_examples.py @@ -0,0 +1,33 @@ +import unittest + +from maas_hub.maas_api import MaasApi + +from modelscope.utils.hub import create_model_if_not_exist + +USER_NAME = 'maasadmin' +PASSWORD = '12345678' + + +class HubExampleTest(unittest.TestCase): + + def setUp(self): + self.api = MaasApi() + # note this is temporary before official account management is ready + self.api.login(USER_NAME, PASSWORD) + + @unittest.skip('to be used for local test only') + def test_example_model_creation(self): + # ATTENTION:change to proper model names before use + model_name = 'cv_unet_person-image-cartoon_compound-models' + model_chinese_name = '达摩卡通化模型' + model_org = 'damo' + model_id = '%s/%s' % (model_org, model_name) + + created = create_model_if_not_exist(self.api, model_id, + model_chinese_name) + if not created: + print('!! NOT created since model already exists !!') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/hub/test_hub_operation.py b/tests/hub/test_hub_operation.py new file mode 100644 index 00000000..d44cd7c1 --- /dev/null +++ b/tests/hub/test_hub_operation.py @@ -0,0 +1,155 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import subprocess +import tempfile +import unittest +import uuid + +from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.file_download import model_file_download +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.hub.utils.utils import get_gitlab_domain + +USER_NAME = 'maasadmin' +PASSWORD = '12345678' + +model_chinese_name = '达摩卡通化模型' +model_org = 'unittest' +DEFAULT_GIT_PATH = 'git' + + +class GitError(Exception): + pass + + +# TODO make thest git operation to git library after merge code. +def run_git_command(git_path, *args) -> subprocess.CompletedProcess: + response = subprocess.run([git_path, *args], capture_output=True) + try: + response.check_returncode() + return response.stdout.decode('utf8') + except subprocess.CalledProcessError as error: + raise GitError(error.stderr.decode('utf8')) + + +# for public project, token can None, private repo, there must token. 
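# --- Illustrative sketch (not part of the committed diff): the clone()/push()
# --- helpers defined next authenticate git by embedding an OAuth2 token into
# --- the repository URL. A minimal, self-contained version of that rewrite,
# --- using a hypothetical URL and token:
from typing import Optional


def _with_token(url: str, token: Optional[str] = None) -> str:
    # Public repos need no token; private repos embed 'oauth2:<token>@' credentials.
    if token is None:
        return url
    return url.replace('//', '//oauth2:%s@' % token)


assert _with_token('http://git.example.com/org/repo.git') == \
    'http://git.example.com/org/repo.git'
assert _with_token('http://git.example.com/org/repo.git', token='abc123') == \
    'http://oauth2:abc123@git.example.com/org/repo.git'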
+def clone(local_dir: str, token: str, url: str): + url = url.replace('//', '//oauth2:%s@' % token) + clone_args = '-C %s clone %s' % (local_dir, url) + clone_args = clone_args.split(' ') + stdout = run_git_command(DEFAULT_GIT_PATH, *clone_args) + print('stdout: %s' % stdout) + + +def push(local_dir: str, token: str, url: str): + url = url.replace('//', '//oauth2:%s@' % token) + push_args = '-C %s push %s' % (local_dir, url) + push_args = push_args.split(' ') + stdout = run_git_command(DEFAULT_GIT_PATH, *push_args) + print('stdout: %s' % stdout) + + +sample_model_url = 'https://mindscope.oss-cn-hangzhou.aliyuncs.com/test_models/mnist-12.onnx' +download_model_file_name = 'mnist-12.onnx' + + +class HubOperationTest(unittest.TestCase): + + def setUp(self): + self.old_cwd = os.getcwd() + self.api = HubApi() + # note this is temporary before official account management is ready + self.api.login(USER_NAME, PASSWORD) + self.model_name = uuid.uuid4().hex + self.model_id = '%s/%s' % (model_org, self.model_name) + self.api.create_model( + model_id=self.model_id, + chinese_name=model_chinese_name, + visibility=5, # 1-private, 5-public + license='apache-2.0') + + def tearDown(self): + os.chdir(self.old_cwd) + self.api.delete_model(model_id=self.model_id) + + def test_model_repo_creation(self): + # change to proper model names before use + try: + info = self.api.get_model(model_id=self.model_id) + assert info['Name'] == self.model_name + except KeyError as ke: + if ke.args[0] == 'name': + print(f'model {self.model_name} already exists, ignore') + else: + raise + + # Note that this can be done via git operation once model repo + # has been created. Git-Op is the RECOMMENDED model upload approach + def test_model_upload(self): + url = f'http://{get_gitlab_domain()}/{self.model_id}' + print(url) + temporary_dir = tempfile.mkdtemp() + os.chdir(temporary_dir) + cmd_args = 'clone %s' % url + cmd_args = cmd_args.split(' ') + out = run_git_command('git', *cmd_args) + print(out) + repo_dir = os.path.join(temporary_dir, self.model_name) + os.chdir(repo_dir) + os.system('touch file1') + os.system('git add file1') + os.system("git commit -m 'Test'") + token = ModelScopeConfig.get_token() + push(repo_dir, token, url) + + def test_download_single_file(self): + url = f'http://{get_gitlab_domain()}/{self.model_id}' + print(url) + temporary_dir = tempfile.mkdtemp() + os.chdir(temporary_dir) + os.system('git clone %s' % url) + repo_dir = os.path.join(temporary_dir, self.model_name) + os.chdir(repo_dir) + os.system('wget %s' % sample_model_url) + os.system('git add .') + os.system("git commit -m 'Add file'") + token = ModelScopeConfig.get_token() + push(repo_dir, token, url) + assert os.path.exists( + os.path.join(temporary_dir, self.model_name, + download_model_file_name)) + downloaded_file = model_file_download( + model_id=self.model_id, file_path=download_model_file_name) + mdtime1 = os.path.getmtime(downloaded_file) + # download again + downloaded_file = model_file_download( + model_id=self.model_id, file_path=download_model_file_name) + mdtime2 = os.path.getmtime(downloaded_file) + assert mdtime1 == mdtime2 + + def test_snapshot_download(self): + url = f'http://{get_gitlab_domain()}/{self.model_id}' + print(url) + temporary_dir = tempfile.mkdtemp() + os.chdir(temporary_dir) + os.system('git clone %s' % url) + repo_dir = os.path.join(temporary_dir, self.model_name) + os.chdir(repo_dir) + os.system('wget %s' % sample_model_url) + os.system('git add .') + os.system("git commit -m 'Add file'") + token = 
ModelScopeConfig.get_token() + push(repo_dir, token, url) + snapshot_path = snapshot_download(model_id=self.model_id) + downloaded_file_path = os.path.join(snapshot_path, + download_model_file_name) + assert os.path.exists(downloaded_file_path) + mdtime1 = os.path.getmtime(downloaded_file_path) + # download again + snapshot_path = snapshot_download(model_id=self.model_id) + mdtime2 = os.path.getmtime(downloaded_file_path) + assert mdtime1 == mdtime2 + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/nlp/test_dialog_state_tracking.py b/tests/pipelines/nlp/test_dialog_state_tracking.py index a6c989bd..41ef7981 100644 --- a/tests/pipelines/nlp/test_dialog_state_tracking.py +++ b/tests/pipelines/nlp/test_dialog_state_tracking.py @@ -1,8 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os +import os.path as osp +import tempfile import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DialogStateTrackingModel from modelscope.pipelines import DialogStateTrackingPipeline, pipeline @@ -10,50 +12,31 @@ from modelscope.preprocessors import DialogStateTrackingPreprocessor from modelscope.utils.constant import Tasks -class DialogIntentPredictionTest(unittest.TestCase): - model_id = 'damo/nlp_space_dialog-intent-prediction' - test_case = [ - 'How do I locate my card?', - 'I still have not received my new card, I ordered over a week ago.' - ] +class DialogStateTrackingTest(unittest.TestCase): + model_id = 'damo/nlp_space_dialog-state-tracking' + test_case = {} - @unittest.skip('test with snapshot_download') def test_run(self): - cache_path = snapshot_download(self.model_id) - preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path) - model = DialogIntentModel( - model_dir=cache_path, - text_field=preprocessor.text_field, - config=preprocessor.config) - - pipelines = [ - DialogIntentPredictionPipeline( - model=model, preprocessor=preprocessor), - pipeline( - task=Tasks.dialog_intent_prediction, - model=model, - preprocessor=preprocessor) - ] - - for my_pipeline, item in list(zip(pipelines, self.test_case)): - print(my_pipeline(item)) - - def test_run_with_model_from_modelhub(self): - # model = Model.from_pretrained(self.model_id) - # preprocessor = DialogIntentPredictionPreprocessor( - # model_dir=model.model_dir) - # + # cache_path = '' + # cache_path = snapshot_download(self.model_id) + + # preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path) + # model = DialogStateTrackingModel( + # model_dir=cache_path, + # text_field=preprocessor.text_field, + # config=preprocessor.config) # pipelines = [ - # DialogIntentPredictionPipeline( - # model=model, preprocessor=preprocessor), + # DialogStateTrackingPipeline(model=model, preprocessor=preprocessor), # pipeline( - # task=Tasks.dialog_intent_prediction, + # task=Tasks.dialog_modeling, # model=model, # preprocessor=preprocessor) # ] - # - # for my_pipeline, item in list(zip(pipelines, self.test_case)): - # print(my_pipeline(item)) + + print('jizhu test') + + @unittest.skip('test with snapshot_download') + def test_run_with_model_from_modelhub(self): pass diff --git a/tests/pipelines/test_action_recognition.py b/tests/pipelines/test_action_recognition.py new file mode 100644 index 00000000..b524ca18 --- /dev/null +++ b/tests/pipelines/test_action_recognition.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
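# --- Illustrative sketch (not part of the committed diff): the hub operation
# --- tests above assert cache reuse by downloading the same file twice and
# --- comparing modification times. The same check, reduced to a standalone
# --- example with a stand-in download function (fake_download is hypothetical,
# --- not a ModelScope API):
import os
import tempfile
import time


def fake_download(path: str) -> str:
    # Mimics model_file_download/snapshot_download: write only when not cached.
    if not os.path.exists(path):
        with open(path, 'w') as f:
            f.write('model-bytes')
    return path


cached_file = os.path.join(tempfile.mkdtemp(), 'model.bin')
mtime1 = os.path.getmtime(fake_download(cached_file))
time.sleep(0.01)
mtime2 = os.path.getmtime(fake_download(cached_file))
assert mtime1 == mtime2  # the second call reused the cached copy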
+# !/usr/bin/env python +import os.path as osp +import shutil +import tempfile +import unittest + +import cv2 + +from modelscope.fileio import File +from modelscope.pipelines import pipeline +from modelscope.pydatasets import PyDataset +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.test_utils import test_level + + +class ActionRecognitionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_TAdaConv_action-recognition' + + @unittest.skip('deprecated, download model from model hub instead') + def test_run_with_direct_file_download(self): + model_path = 'https://aquila2-online-models.oss-cn-shanghai.aliyuncs.com/maas_test/pytorch_model.pt' + config_path = 'https://aquila2-online-models.oss-cn-shanghai.aliyuncs.com/maas_test/configuration.json' + with tempfile.TemporaryDirectory() as tmp_dir: + model_file = osp.join(tmp_dir, ModelFile.TORCH_MODEL_FILE) + with open(model_file, 'wb') as ofile1: + ofile1.write(File.read(model_path)) + config_file = osp.join(tmp_dir, ModelFile.CONFIGURATION) + with open(config_file, 'wb') as ofile2: + ofile2.write(File.read(config_path)) + recognition_pipeline = pipeline( + Tasks.action_recognition, model=tmp_dir) + result = recognition_pipeline( + 'data/test/videos/action_recognition_test_video.mp4') + print(f'recognition output: {result}.') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + recognition_pipeline = pipeline( + Tasks.action_recognition, model=self.model_id) + result = recognition_pipeline( + 'data/test/videos/action_recognition_test_video.mp4') + + print(f'recognition output: {result}.') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_default_model(self): + recognition_pipeline = pipeline(Tasks.action_recognition) + result = recognition_pipeline( + 'data/test/videos/action_recognition_test_video.mp4') + + print(f'recognition output: {result}.') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/nlp/test_dialog_intent_prediction.py b/tests/pipelines/test_dialog_intent_prediction.py similarity index 96% rename from tests/pipelines/nlp/test_dialog_intent_prediction.py rename to tests/pipelines/test_dialog_intent_prediction.py index 0ec4e1e7..97cdbb3d 100644 --- a/tests/pipelines/nlp/test_dialog_intent_prediction.py +++ b/tests/pipelines/test_dialog_intent_prediction.py @@ -1,8 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DialogIntentModel from modelscope.pipelines import DialogIntentPredictionPipeline, pipeline diff --git a/tests/pipelines/nlp/test_dialog_modeling.py b/tests/pipelines/test_dialog_modeling.py similarity index 98% rename from tests/pipelines/nlp/test_dialog_modeling.py rename to tests/pipelines/test_dialog_modeling.py index 7d4da8fe..f606ba49 100644 --- a/tests/pipelines/nlp/test_dialog_modeling.py +++ b/tests/pipelines/test_dialog_modeling.py @@ -4,8 +4,7 @@ import os.path as osp import tempfile import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import DialogModelingModel from modelscope.pipelines import DialogModelingPipeline, pipeline diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py new file mode 100644 index 00000000..49c5dc8a --- /dev/null +++ b/tests/pipelines/test_fill_mask.py @@ -0,0 +1,129 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import StructBertForMaskedLM, VecoForMaskedLM +from modelscope.pipelines import FillMaskPipeline, pipeline +from modelscope.preprocessors import FillMaskPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class FillMaskTest(unittest.TestCase): + model_id_sbert = { + 'zh': 'damo/nlp_structbert_fill-mask_chinese-large', + 'en': 'damo/nlp_structbert_fill-mask_english-large' + } + model_id_veco = 'damo/nlp_veco_fill-mask-large' + + ori_texts = { + 'zh': + '段誉轻挥折扇,摇了摇头,说道:“你师父是你的师父,你师父可不是我的师父。' + '你师父差得动你,你师父可差不动我。', + 'en': + 'Everything in what you call reality is really just a reflection of your ' + 'consciousness. Your whole universe is just a mirror reflection of your story.' + } + + test_inputs = { + 'zh': + '段誉轻[MASK]折扇,摇了摇[MASK],[MASK]道:“你师父是你的[MASK][MASK],你' + '师父可不是[MASK]的师父。你师父差得动你,你师父可[MASK]不动我。', + 'en': + 'Everything in [MASK] you call reality is really [MASK] a reflection of your ' + '[MASK]. Your [MASK] universe is just a mirror [MASK] of your story.' 
+ } + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + # sbert + for language in ['zh', 'en']: + model_dir = snapshot_download(self.model_id_sbert[language]) + preprocessor = FillMaskPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = StructBertForMaskedLM(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language] + print( + f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' + ) + + # veco + model_dir = snapshot_download(self.model_id_veco) + preprocessor = FillMaskPreprocessor( + model_dir, first_sequence='sentence', second_sequence=None) + model = VecoForMaskedLM(model_dir) + pipeline1 = FillMaskPipeline(model, preprocessor) + pipeline2 = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + for language in ['zh', 'en']: + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language].replace('[MASK]', '') + print( + f'\nori_text: {ori_text}\ninput: {test_input}\npipeline1: ' + f'{pipeline1(test_input)}\npipeline2: {pipeline2(test_input)}\n' + ) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + # sbert + for language in ['zh', 'en']: + print(self.model_id_sbert[language]) + model = Model.from_pretrained(self.model_id_sbert[language]) + preprocessor = FillMaskPreprocessor( + model.model_dir, + first_sequence='sentence', + second_sequence=None) + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=model, preprocessor=preprocessor) + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + + # veco + model = Model.from_pretrained(self.model_id_veco) + preprocessor = FillMaskPreprocessor( + model.model_dir, first_sequence='sentence', second_sequence=None) + pipeline_ins = pipeline( + Tasks.fill_mask, model=model, preprocessor=preprocessor) + for language in ['zh', 'en']: + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language].replace('[MASK]', '') + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + # veco + pipeline_ins = pipeline(task=Tasks.fill_mask, model=self.model_id_veco) + for language in ['zh', 'en']: + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language].replace('[MASK]', '') + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + f'{pipeline_ins(test_input)}\n') + + # structBert + language = 'zh' + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=self.model_id_sbert[language]) + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.fill_mask) + language = 'en' + ori_text = self.ori_texts[language] + test_input = self.test_inputs[language].replace('[MASK]', '') + print(f'\nori_text: {ori_text}\ninput: {test_input}\npipeline: ' + 
f'{pipeline_ins(test_input)}\n') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_captioning.py b/tests/pipelines/test_image_captioning.py index 74a65806..5fa6ff49 100644 --- a/tests/pipelines/test_image_captioning.py +++ b/tests/pipelines/test_image_captioning.py @@ -1,10 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os -import tempfile import unittest -from modelscope.fileio import File from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -12,23 +9,13 @@ from modelscope.utils.test_utils import test_level class ImageCaptionTest(unittest.TestCase): - @unittest.skip('skip before model is restored in model hub') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run(self): - model = 'https://ofa-beijing.oss-cn-beijing.aliyuncs.com/checkpoints/caption_large_best_clean.pt' - - os.system( - 'wget https://jirenmr.oss-cn-zhangjiakou.aliyuncs.com/ofa/BPE.zip' - ) - os.system('unzip BPE.zip') - bpe_dir = './BPE' - - with tempfile.NamedTemporaryFile('wb', suffix='.pb') as ofile: - ofile.write(File.read(model)) - img_captioning = pipeline( - Tasks.image_captioning, model=ofile.name, bpe_dir=bpe_dir) - - result = img_captioning('data/test/images/image_matting.png') - print(result['caption']) + img_captioning = pipeline( + Tasks.image_captioning, + model='damo/ofa_image-caption_coco_large_en') + result = img_captioning('data/test/images/image_captioning.png') + print(result['caption']) if __name__ == '__main__': diff --git a/tests/pipelines/test_image_matting.py b/tests/pipelines/test_image_matting.py index 6e102d00..1b547e14 100644 --- a/tests/pipelines/test_image_matting.py +++ b/tests/pipelines/test_image_matting.py @@ -10,7 +10,6 @@ from modelscope.fileio import File from modelscope.pipelines import pipeline from modelscope.pydatasets import PyDataset from modelscope.utils.constant import ModelFile, Tasks -from modelscope.utils.hub import get_model_cache_dir from modelscope.utils.test_utils import test_level @@ -18,11 +17,6 @@ class ImageMattingTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/cv_unet_image-matting' - # switch to False if downloading everytime is not desired - purge_cache = True - if purge_cache: - shutil.rmtree( - get_model_cache_dir(self.model_id), ignore_errors=True) @unittest.skip('deprecated, download model from model hub instead') def test_run_with_direct_file_download(self): @@ -58,7 +52,7 @@ class ImageMattingTest(unittest.TestCase): cv2.imwrite('result.png', result['output_png']) print(f'Output written to {osp.abspath("result.png")}') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): img_matting = pipeline(Tasks.image_matting) @@ -66,6 +60,17 @@ class ImageMattingTest(unittest.TestCase): cv2.imwrite('result.png', result['output_png']) print(f'Output written to {osp.abspath("result.png")}') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_modelscope_dataset(self): + dataset = PyDataset.load('beans', split='train', target='image') + img_matting = pipeline(Tasks.image_matting, model=self.model_id) + result = img_matting(dataset) + for i in range(10): + cv2.imwrite(f'result_{i}.png', next(result)['output_png']) + print( + f'Output written to dir: 
{osp.dirname(osp.abspath("result_0.png"))}' + ) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py new file mode 100644 index 00000000..0c8da8b4 --- /dev/null +++ b/tests/pipelines/test_nli.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForNLI +from modelscope.pipelines import NLIPipeline, pipeline +from modelscope.preprocessors import NLIPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class NLITest(unittest.TestCase): + model_id = 'damo/nlp_structbert_nli_chinese-base' + sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' + sentence2 = '四川商务职业学院商务管理在哪个校区?' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = NLIPreprocessor(cache_path) + model = SbertForNLI(cache_path, tokenizer=tokenizer) + pipeline1 = NLIPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) + print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' + f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') + print() + print( + f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' + f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = NLIPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.nli, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.nli) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py new file mode 100644 index 00000000..986961b7 --- /dev/null +++ b/tests/pipelines/test_ocr_detection.py @@ -0,0 +1,37 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
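# --- Illustrative sketch (not part of the committed diff): the image-matting
# --- test above streams PyDataset.load(..., target='image') through a pipeline,
# --- relying on iteration yielding only the target column. A minimal stand-in
# --- for that iteration contract (TargetIterable is hypothetical, not a
# --- ModelScope class):
class TargetIterable:

    def __init__(self, rows, target=None):
        self.rows = rows
        self.target = target

    def __iter__(self):
        for item in self.rows:
            # With a target column set, yield just that field; otherwise the whole row.
            yield item[self.target] if self.target is not None else item


rows = [{'image': 'img0.png', 'label': 0}, {'image': 'img1.png', 'label': 1}]
assert list(TargetIterable(rows, target='image')) == ['img0.png', 'img1.png']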
+import os.path as osp +import shutil +import sys +import tempfile +import unittest +from typing import Any, Dict, List, Tuple, Union + +import cv2 +import numpy as np +import PIL + +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class OCRDetectionTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo' + self.test_image = 'data/test/images/ocr_detection.jpg' + + def pipeline_inference(self, pipeline: Pipeline, input_location: str): + result = pipeline(input_location) + print('ocr detection results: ') + print(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + ocr_detection = pipeline(Tasks.ocr_detection) + self.pipeline_inference(ocr_detection, self.test_image) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_person_image_cartoon.py b/tests/pipelines/test_person_image_cartoon.py index ed912b1c..f47ca008 100644 --- a/tests/pipelines/test_person_image_cartoon.py +++ b/tests/pipelines/test_person_image_cartoon.py @@ -42,7 +42,7 @@ class ImageCartoonTest(unittest.TestCase): img_cartoon = pipeline(Tasks.image_generation, model=self.model_id) self.pipeline_inference(img_cartoon, self.test_image) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): img_cartoon = pipeline(Tasks.image_generation) self.pipeline_inference(img_cartoon, self.test_image) diff --git a/tests/pipelines/test_sentence_similarity.py b/tests/pipelines/test_sentence_similarity.py index ac2ff4fb..df38593f 100644 --- a/tests/pipelines/test_sentence_similarity.py +++ b/tests/pipelines/test_sentence_similarity.py @@ -2,14 +2,12 @@ import shutil import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import SbertForSentenceSimilarity from modelscope.pipelines import SentenceSimilarityPipeline, pipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.hub import get_model_cache_dir from modelscope.utils.test_utils import test_level @@ -18,14 +16,7 @@ class SentenceSimilarityTest(unittest.TestCase): sentence1 = '今天气温比昨天高么?' sentence2 = '今天湿度比昨天高么?' 
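# --- Illustrative sketch (not part of the committed diff): several pipeline
# --- tests in this change are re-gated with @unittest.skipUnless(test_level() >= N, ...).
# --- A condensed, self-contained example of how that level gating behaves
# --- (the fixed TEST_LEVEL value and test names here are hypothetical; the real
# --- helper reads the TEST_LEVEL environment variable):
import unittest

TEST_LEVEL = 2


class GatedTests(unittest.TestCase):

    @unittest.skipUnless(TEST_LEVEL >= 0, 'skip test in current test level')
    def test_cheap_case(self):
        self.assertTrue(True)

    @unittest.skipUnless(TEST_LEVEL >= 2, 'skip test in current test level')
    def test_expensive_download_case(self):
        self.assertTrue(True)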
- def setUp(self) -> None: - # switch to False if downloading everytime is not desired - purge_cache = True - if purge_cache: - shutil.rmtree( - get_model_cache_dir(self.model_id), ignore_errors=True) - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run(self): cache_path = snapshot_download(self.model_id) tokenizer = SequenceClassificationPreprocessor(cache_path) @@ -41,7 +32,7 @@ class SentenceSimilarityTest(unittest.TestCase): f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}') - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) tokenizer = SequenceClassificationPreprocessor(model.model_dir) @@ -57,7 +48,7 @@ class SentenceSimilarityTest(unittest.TestCase): task=Tasks.sentence_similarity, model=self.model_id) print(pipeline_ins(input=(self.sentence1, self.sentence2))) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.sentence_similarity) print(pipeline_ins(input=(self.sentence1, self.sentence2))) diff --git a/tests/pipelines/test_sentiment_classification.py b/tests/pipelines/test_sentiment_classification.py new file mode 100644 index 00000000..0ba22d5c --- /dev/null +++ b/tests/pipelines/test_sentiment_classification.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForSentimentClassification +from modelscope.pipelines import SentimentClassificationPipeline, pipeline +from modelscope.preprocessors import SentimentClassificationPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class SentimentClassificationTest(unittest.TestCase): + model_id = 'damo/nlp_structbert_sentiment-classification_chinese-base' + sentence1 = '启动的时候很大声音,然后就会听到1.2秒的卡察的声音,类似齿轮摩擦的声音' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_direct_file_download(self): + cache_path = snapshot_download(self.model_id) + tokenizer = SentimentClassificationPreprocessor(cache_path) + model = SbertForSentimentClassification( + cache_path, tokenizer=tokenizer) + pipeline1 = SentimentClassificationPipeline( + model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer) + print(f'sentence1: {self.sentence1}\n' + f'pipeline1:{pipeline1(input=self.sentence1)}') + print() + print(f'sentence1: {self.sentence1}\n' + f'pipeline1: {pipeline2(input=self.sentence1)}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = SentimentClassificationPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, + model=model, + preprocessor=tokenizer) + print(pipeline_ins(input=self.sentence1)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def 
test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentiment_classification, model=self.model_id) + print(pipeline_ins(input=self.sentence1)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.sentiment_classification) + print(pipeline_ins(input=self.sentence1)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_speech_signal_process.py b/tests/pipelines/test_speech_signal_process.py index 8b5c9468..1b070fda 100644 --- a/tests/pipelines/test_speech_signal_process.py +++ b/tests/pipelines/test_speech_signal_process.py @@ -3,9 +3,10 @@ import shutil import unittest from modelscope.fileio import File +from modelscope.metainfo import Pipelines from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.hub import get_model_cache_dir +from modelscope.utils.test_utils import test_level NEAREND_MIC_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/nearend_mic.wav' FAREND_SPEECH_URL = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/AEC/sample_audio/farend_speech.wav' @@ -30,14 +31,10 @@ class SpeechSignalProcessTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/speech_dfsmn_aec_psm_16k' - # switch to False if downloading everytime is not desired - purge_cache = True - if purge_cache: - shutil.rmtree( - get_model_cache_dir(self.model_id), ignore_errors=True) # A temporary hack to provide c++ lib. Download it first. download(AEC_LIB_URL, AEC_LIB_FILE) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run(self): download(NEAREND_MIC_URL, NEAREND_MIC_FILE) download(FAREND_SPEECH_URL, FAREND_SPEECH_FILE) @@ -48,7 +45,7 @@ class SpeechSignalProcessTest(unittest.TestCase): aec = pipeline( Tasks.speech_signal_process, model=self.model_id, - pipeline_name=r'speech_dfsmn_aec_psm_16k') + pipeline_name=Pipelines.speech_dfsmn_aec_psm_16k) aec(input, output_path='output.wav') diff --git a/tests/pipelines/test_text_classification.py b/tests/pipelines/test_text_classification.py index 01fdd29b..9e5f15b9 100644 --- a/tests/pipelines/test_text_classification.py +++ b/tests/pipelines/test_text_classification.py @@ -1,17 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import shutil import unittest -import zipfile -from pathlib import Path -from modelscope.fileio import File from modelscope.models import Model -from modelscope.models.nlp import BertForSequenceClassification from modelscope.pipelines import SequenceClassificationPipeline, pipeline from modelscope.preprocessors import SequenceClassificationPreprocessor from modelscope.pydatasets import PyDataset from modelscope.utils.constant import Hubs, Tasks -from modelscope.utils.hub import get_model_cache_dir from modelscope.utils.test_utils import test_level @@ -19,11 +14,6 @@ class SequenceClassificationTest(unittest.TestCase): def setUp(self) -> None: self.model_id = 'damo/bert-base-sst2' - # switch to False if downloading everytime is not desired - purge_cache = True - if purge_cache: - shutil.rmtree( - get_model_cache_dir(self.model_id), ignore_errors=True) def predict(self, pipeline_ins: SequenceClassificationPipeline): from easynlp.appzoo import load_dataset @@ -44,31 +34,6 @@ class SequenceClassificationTest(unittest.TestCase): break print(r) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') - def test_run(self): - model_url = 'https://atp-modelzoo-sh.oss-cn-shanghai.aliyuncs.com' \ - '/release/easynlp_modelzoo/alibaba-pai/bert-base-sst2.zip' - cache_path_str = r'.cache/easynlp/bert-base-sst2.zip' - cache_path = Path(cache_path_str) - - if not cache_path.exists(): - cache_path.parent.mkdir(parents=True, exist_ok=True) - cache_path.touch(exist_ok=True) - with cache_path.open('wb') as ofile: - ofile.write(File.read(model_url)) - - with zipfile.ZipFile(cache_path_str, 'r') as zipf: - zipf.extractall(cache_path.parent) - path = r'.cache/easynlp/' - model = BertForSequenceClassification(path) - preprocessor = SequenceClassificationPreprocessor( - path, first_sequence='sentence', second_sequence=None) - pipeline1 = SequenceClassificationPipeline(model, preprocessor) - self.predict(pipeline1) - pipeline2 = pipeline( - Tasks.text_classification, model=model, preprocessor=preprocessor) - print(pipeline2('Hello world!')) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) @@ -86,18 +51,26 @@ class SequenceClassificationTest(unittest.TestCase): task=Tasks.text_classification, model=self.model_id) result = text_classification( PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface)) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface)) self.printDataset(result) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): text_classification = pipeline(task=Tasks.text_classification) result = text_classification( PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface)) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface)) self.printDataset(result) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_dataset(self): model = Model.from_pretrained(self.model_id) preprocessor = SequenceClassificationPreprocessor( @@ -105,9 +78,21 @@ class SequenceClassificationTest(unittest.TestCase): text_classification = pipeline( Tasks.text_classification, model=model, preprocessor=preprocessor) # loaded from 
huggingface dataset - # TODO: rename parameter as dataset_name and subset_name dataset = PyDataset.load( - 'glue', name='sst2', target='sentence', hub=Hubs.huggingface) + 'glue', + subset_name='sst2', + split='train', + target='sentence', + hub=Hubs.huggingface) + result = text_classification(dataset) + self.printDataset(result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_modelscope_dataset(self): + text_classification = pipeline(task=Tasks.text_classification) + # loaded from modelscope dataset + dataset = PyDataset.load( + 'squad', split='train', target='context', hub=Hubs.modelscope) result = text_classification(dataset) self.printDataset(result) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index fbdd165f..9df3b8bb 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -1,8 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.models.nlp import PalmForTextGeneration from modelscope.pipelines import TextGenerationPipeline, pipeline @@ -69,7 +68,7 @@ class TextGenerationTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.text_generation, model=model_id) print(pipeline_ins(input)) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.text_generation) print(pipeline_ins(self.input_zh)) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index c9b988a1..e92047d6 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -1,7 +1,5 @@ -import time import unittest -import json import tensorflow as tf # NOTICE: Tensorflow 1.15 seems not so compatible with pytorch. 
# A segmentation fault may be raise by pytorch cpp library @@ -10,20 +8,20 @@ import tensorflow as tf import torch from scipy.io.wavfile import write -from modelscope.fileio import File -from modelscope.models import Model, build_model -from modelscope.models.audio.tts.am import SambertNetHifi16k -from modelscope.models.audio.tts.vocoder import AttrDict, Hifigan16k +from modelscope.metainfo import Pipelines, Preprocessors +from modelscope.models import Model from modelscope.pipelines import pipeline from modelscope.preprocessors import build_preprocessor -from modelscope.utils.constant import Fields, InputFields, Tasks +from modelscope.utils.constant import Fields from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level logger = get_logger() class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_pipeline(self): lang_type = 'pinyin' text = '明天天气怎么样' @@ -32,7 +30,7 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): voc_model_id = 'damo/speech_hifigan16k_tts_zhitian_emo' cfg_preprocessor = dict( - type='text_to_tacotron_symbols', + type=Preprocessors.text_to_tacotron_symbols, model_name=preprocessor_model_id, lang_type=lang_type) preprocessor = build_preprocessor(cfg_preprocessor, Fields.audio) @@ -45,7 +43,7 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase): self.assertTrue(voc is not None) sambert_tts = pipeline( - pipeline_name='tts-sambert-hifigan-16k', + pipeline_name=Pipelines.sambert_hifigan_16k_tts, config_file='', model=[am, voc], preprocessor=preprocessor) diff --git a/tests/pipelines/test_word_segmentation.py b/tests/pipelines/test_word_segmentation.py index 4ec2bf29..d33e4bdb 100644 --- a/tests/pipelines/test_word_segmentation.py +++ b/tests/pipelines/test_word_segmentation.py @@ -2,14 +2,12 @@ import shutil import unittest -from maas_hub.snapshot_download import snapshot_download - +from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model -from modelscope.models.nlp import StructBertForTokenClassification +from modelscope.models.nlp import SbertForTokenClassification from modelscope.pipelines import WordSegmentationPipeline, pipeline from modelscope.preprocessors import TokenClassifcationPreprocessor from modelscope.utils.constant import Tasks -from modelscope.utils.hub import get_model_cache_dir from modelscope.utils.test_utils import test_level @@ -17,19 +15,11 @@ class WordSegmentationTest(unittest.TestCase): model_id = 'damo/nlp_structbert_word-segmentation_chinese-base' sentence = '今天天气不错,适合出去游玩' - def setUp(self) -> None: - # switch to False if downloading everytime is not desired - purge_cache = True - if purge_cache: - shutil.rmtree( - get_model_cache_dir(self.model_id), ignore_errors=True) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) tokenizer = TokenClassifcationPreprocessor(cache_path) - model = StructBertForTokenClassification( - cache_path, tokenizer=tokenizer) + model = SbertForTokenClassification(cache_path, tokenizer=tokenizer) pipeline1 = WordSegmentationPipeline(model, preprocessor=tokenizer) pipeline2 = pipeline( Tasks.word_segmentation, model=model, preprocessor=tokenizer) @@ -46,13 +36,13 @@ class WordSegmentationTest(unittest.TestCase): task=Tasks.word_segmentation, model=model, preprocessor=tokenizer) 
print(pipeline_ins(input=self.sentence)) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): pipeline_ins = pipeline( task=Tasks.word_segmentation, model=self.model_id) print(pipeline_ins(input=self.sentence)) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.word_segmentation) print(pipeline_ins(input=self.sentence)) diff --git a/tests/preprocessors/test_image.py b/tests/preprocessors/test_image.py index 21ae780e..4d66c171 100644 --- a/tests/preprocessors/test_image.py +++ b/tests/preprocessors/test_image.py @@ -5,7 +5,6 @@ import unittest import PIL from modelscope.preprocessors import load_image -from modelscope.utils.logger import get_logger class ImagePreprocessorTest(unittest.TestCase): diff --git a/tests/preprocessors/test_text_to_speech.py b/tests/preprocessors/test_text_to_speech.py index 18b66987..fd2473fd 100644 --- a/tests/preprocessors/test_text_to_speech.py +++ b/tests/preprocessors/test_text_to_speech.py @@ -1,6 +1,7 @@ import shutil import unittest +from modelscope.metainfo import Preprocessors from modelscope.preprocessors import build_preprocessor from modelscope.utils.constant import Fields, InputFields from modelscope.utils.logger import get_logger @@ -14,7 +15,7 @@ class TtsPreprocessorTest(unittest.TestCase): lang_type = 'pinyin' text = '今天天气不错,我们去散步吧。' cfg = dict( - type='text_to_tacotron_symbols', + type=Preprocessors.text_to_tacotron_symbols, model_name='damo/speech_binary_tts_frontend_resource', lang_type=lang_type) preprocessor = build_preprocessor(cfg, Fields.audio) diff --git a/tests/pydatasets/test_py_dataset.py b/tests/pydatasets/test_py_dataset.py index 7accd814..e84f240a 100644 --- a/tests/pydatasets/test_py_dataset.py +++ b/tests/pydatasets/test_py_dataset.py @@ -2,42 +2,112 @@ import unittest import datasets as hfdata +from modelscope.models import Model +from modelscope.preprocessors import SequenceClassificationPreprocessor +from modelscope.preprocessors.base import Preprocessor from modelscope.pydatasets import PyDataset +from modelscope.utils.constant import Hubs +from modelscope.utils.test_utils import require_tf, require_torch, test_level -class PyDatasetTest(unittest.TestCase): +class ImgPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.path_field = kwargs.pop('image_path', 'image_path') + self.width = kwargs.pop('width', 'width') + self.height = kwargs.pop('height', 'width') - def setUp(self): - # ds1 initialized from in memory json - self.json_data = { - 'dummy': [{ - 'a': i, - 'x': i * 10, - 'c': i * 100 - } for i in range(1, 11)] + def __call__(self, data): + import cv2 + image_path = data.get(self.path_field) + if not image_path: + return None + img = cv2.imread(image_path) + return { + 'image': + cv2.resize(img, + (data.get(self.height, 128), data.get(self.width, 128))) } - hfds1 = hfdata.Dataset.from_dict(self.json_data) - self.ds1 = PyDataset.from_hf_dataset(hfds1) - # ds2 initialized from hg hub - hfds2 = hfdata.load_dataset( - 'glue', 'mrpc', revision='2.0.0', split='train') - self.ds2 = PyDataset.from_hf_dataset(hfds2) - def tearDown(self): - pass +class PyDatasetTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + 
def test_ds_basic(self): + ms_ds_full = PyDataset.load('squad') + ms_ds_full_hf = hfdata.load_dataset('squad') + ms_ds_train = PyDataset.load('squad', split='train') + ms_ds_train_hf = hfdata.load_dataset('squad', split='train') + ms_image_train = PyDataset.from_hf_dataset( + hfdata.load_dataset('beans', split='train')) + self.assertEqual(ms_ds_full['train'][0], ms_ds_full_hf['train'][0]) + self.assertEqual(ms_ds_full['validation'][0], + ms_ds_full_hf['validation'][0]) + self.assertEqual(ms_ds_train[0], ms_ds_train_hf[0]) + print(next(iter(ms_ds_full['train']))) + print(next(iter(ms_ds_train))) + print(next(iter(ms_image_train))) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @require_torch + def test_to_torch_dataset_text(self): + model_id = 'damo/bert-base-sst2' + nlp_model = Model.from_pretrained(model_id) + preprocessor = SequenceClassificationPreprocessor( + nlp_model.model_dir, + first_sequence='context', + second_sequence=None) + ms_ds_train = PyDataset.load('squad', split='train') + pt_dataset = ms_ds_train.to_torch_dataset(preprocessors=preprocessor) + import torch + dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5) + print(next(iter(dataloader))) - def test_to_hf_dataset(self): - hfds = self.ds1.to_hf_dataset() - hfds1 = hfdata.Dataset.from_dict(self.json_data) - self.assertEqual(hfds.data, hfds1.data) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @require_tf + def test_to_tf_dataset_text(self): + import tensorflow as tf + tf.compat.v1.enable_eager_execution() + model_id = 'damo/bert-base-sst2' + nlp_model = Model.from_pretrained(model_id) + preprocessor = SequenceClassificationPreprocessor( + nlp_model.model_dir, + first_sequence='context', + second_sequence=None) + ms_ds_train = PyDataset.load('squad', split='train') + tf_dataset = ms_ds_train.to_tf_dataset( + batch_size=5, + shuffle=True, + preprocessors=preprocessor, + drop_remainder=True) + print(next(iter(tf_dataset))) - # simple map function - hfds = hfds.map(lambda e: {'new_feature': e['dummy']['a']}) - self.assertEqual(len(hfds['new_feature']), 10) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @require_torch + def test_to_torch_dataset_img(self): + ms_image_train = PyDataset.from_hf_dataset( + hfdata.load_dataset('beans', split='train')) + pt_dataset = ms_image_train.to_torch_dataset( + preprocessors=ImgPreprocessor( + image_path='image_file_path', label='labels')) + import torch + dataloader = torch.utils.data.DataLoader(pt_dataset, batch_size=5) + print(next(iter(dataloader))) - hfds2 = self.ds2.to_hf_dataset() - self.assertTrue(hfds2[0]['sentence1'].startswith('Amrozi')) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @require_tf + def test_to_tf_dataset_img(self): + import tensorflow as tf + tf.compat.v1.enable_eager_execution() + ms_image_train = PyDataset.load('beans', split='train') + tf_dataset = ms_image_train.to_tf_dataset( + batch_size=5, + shuffle=True, + preprocessors=ImgPreprocessor(image_path='image_file_path'), + drop_remainder=True, + label_cols='labels') + print(next(iter(tf_dataset))) if __name__ == '__main__': diff --git a/tests/run.py b/tests/run.py index a904ba8e..38c5a897 100644 --- a/tests/run.py +++ b/tests/run.py @@ -61,7 +61,7 @@ if __name__ == '__main__': parser.add_argument( '--test_dir', default='tests', help='directory to be tested') parser.add_argument( - '--level', default=0, help='2 -- all, 1 -- p1, 0 -- p0') + '--level', default=0, type=int, 
help='2 -- all, 1 -- p1, 0 -- p0') args = parser.parse_args() set_test_level(args.level) logger.info(f'TEST LEVEL: {test_level()}') diff --git a/tests/utils/test_hub_operation.py b/tests/utils/test_hub_operation.py deleted file mode 100644 index f432a60c..00000000 --- a/tests/utils/test_hub_operation.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os.path as osp -import unittest - -from maas_hub.maas_api import MaasApi -from maas_hub.repository import Repository - -USER_NAME = 'maasadmin' -PASSWORD = '12345678' - - -class HubOperationTest(unittest.TestCase): - - def setUp(self): - self.api = MaasApi() - # note this is temporary before official account management is ready - self.api.login(USER_NAME, PASSWORD) - - @unittest.skip('to be used for local test only') - def test_model_repo_creation(self): - # change to proper model names before use - model_name = 'cv_unet_person-image-cartoon_compound-models' - model_chinese_name = '达摩卡通化模型' - model_org = 'damo' - try: - self.api.create_model( - owner=model_org, - name=model_name, - chinese_name=model_chinese_name, - visibility=5, # 1-private, 5-public - license='apache-2.0') - # TODO: support proper name duplication checking - except KeyError as ke: - if ke.args[0] == 'name': - print(f'model {self.model_name} already exists, ignore') - else: - raise - - # Note that this can be done via git operation once model repo - # has been created. Git-Op is the RECOMMENDED model upload approach - @unittest.skip('to be used for local test only') - def test_model_upload(self): - local_path = '/path/to/local/model/directory' - assert osp.exists(local_path), 'Local model directory not exist.' - repo = Repository(local_dir=local_path) - repo.push_to_hub(commit_message='Upload model files') - - -if __name__ == '__main__': - unittest.main()
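For reference, two patterns recur throughout the test changes above: gating individual cases with test_level(), and the reworked PyDataset.load keyword arguments (subset_name/split/target/hub). The sketch below is illustrative only and is not part of the committed change: the ExampleGatedTest class name is invented, while the imports, the skipUnless idiom, the PyDataset.load keywords, and the integer --level flag in tests/run.py are taken directly from the hunks above.

import unittest

from modelscope.pipelines import pipeline
from modelscope.pydatasets import PyDataset
from modelscope.utils.constant import Hubs, Tasks
from modelscope.utils.test_utils import test_level


class ExampleGatedTest(unittest.TestCase):

    # Runs only when the suite is invoked with a high enough level,
    # e.g. `python tests/run.py --test_dir tests --level 2`
    # (per the updated help text: 2 -- all, 1 -- p1, 0 -- p0).
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_classification_on_hf_dataset(self):
        # PyDataset.load now takes subset_name/split explicitly
        # instead of the old `name` parameter.
        dataset = PyDataset.load(
            'glue',
            subset_name='sst2',
            split='train',
            target='sentence',
            hub=Hubs.huggingface)
        text_classification = pipeline(task=Tasks.text_classification)
        print(text_classification(dataset))


if __name__ == '__main__':
    unittest.main()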